diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index e793dda3823..8c2226bfa98 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index ba959f2bc27..a0c2e65b337 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.8-conda/devcontainer.json b/.devcontainer/cuda12.8-conda/devcontainer.json index 5a544e26145..3977a1c5f86 100644 --- a/.devcontainer/cuda12.8-conda/devcontainer.json +++ b/.devcontainer/cuda12.8-conda/devcontainer.json @@ -5,45 +5,55 @@ "args": { "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.8-conda" ], - "hostRequirements": {"gpu": "optional"}, + "hostRequirements": { + "gpu": "optional" + }, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": { - "version": "12.8", - "installCompilers": false, - "installProfilers": true, - "installDevPackages": false, - "installcuDNN": false, - "installcuTensor": false, - "installNCCL": false, - "installCUDARuntime": false, - "installNVRTC": false, - "installOpenCL": false, - "installcuBLAS": false, - "installcuSPARSE": false, - "installcuFFT": false, - "installcuFile": false, - "installcuRAND": false, - "installcuSOLVER": false, - "installNPP": false, - "installnvJPEG": false, - "pruneStaticLibs": true - }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/cuda:25.4": { + "version": "12.8", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": 
false, + "installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi" + ], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", "mounts": [ diff --git a/.devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda12.8-pip/devcontainer.json index e9ffa620bb0..ab83fa88300 100644 --- a/.devcontainer/cuda12.8-pip/devcontainer.json +++ b/.devcontainer/cuda12.8-pip/devcontainer.json @@ -5,23 +5,33 @@ "args": { "CUDA": "12.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.04-cpp-cuda12.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.04-cuda12.8-pip" ], - "hostRequirements": {"gpu": "optional"}, + "hostRequirements": { + "gpu": "optional" + }, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"], - "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", "mounts": [ diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f0be4cb509d..9bcd3a65a9d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,19 +57,19 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" run_script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + 
uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@nvks-runners with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index bd5e87a0a0b..8730ae43ddf 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6df5019b2aa..34fcbc14420 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,7 +42,7 @@ jobs: - pandas-tests-diff - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@nvks-runners if: always() with: needs: ${{ toJSON(needs) }} @@ -70,7 +70,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@nvks-runners with: files_yaml: | test_cpp: @@ -123,48 +123,48 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@nvks-runners with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: build_type: pull-request node_type: "cpu16" cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request run_script: "ci/cpp_linters.sh" conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@nvks-runners with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@nvks-runners with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -173,7 +173,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -181,18 +181,18 @@ jobs: conda-java-tests: 
needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -202,28 +202,28 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: pull-request - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -233,21 +233,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -255,7 +255,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -264,7 +264,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -274,7 +274,7 @@ jobs: cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -283,7 +283,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -292,7 +292,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -302,7 +302,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@nvks-runners with: node_type: "cpu32" arch: '["amd64"]' @@ -314,7 +314,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -325,7 +325,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -337,7 +337,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: node_type: "cpu4" build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 8ad6448bc27..b1bd2d4e768 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@nvks-runners if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@nvks-runners if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d909fd4a657..b6b2caddeb8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,19 +33,19 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" 
container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -86,31 +86,31 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -119,7 +119,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -137,19 +137,19 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch 
}} date: ${{ inputs.date }} sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" + node_type: "gpu-l4-latest-1" container_image: "rapidsai/ci-conda:latest" run_script: | ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} @@ -158,7 +158,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 07f0f83cc92..7b5b4810fb6 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda-12.8.0 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@nvks-runners with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/README.md b/README.md index 4e917e15b4e..a240d6c2aa9 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=25.02 python=3.12 cuda-version=12.8 + cudf=25.04 python=3.12 cuda-version=12.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 72eefaf7c79..b922658ff3f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.02.00 +25.04.00 diff --git a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh index ce3291b864a..ed564a39745 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh @@ -13,6 +13,7 @@ runtest() { local lib=$1 local mode=$2 + echo "Running tests for $lib in $mode mode" local plugin="" if [ "$mode" = "cudf" ]; then plugin="-p cudf.pandas" diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 43ed3594917..cf0a16fb3cb 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -11,7 +11,6 @@ extract_lib_from_dependencies_yaml() { # Parse all keys in dependencies.yaml under the "files" section, # extract all the keys that start with "test_", and extract the rest extracted_libs="$(yq -o json "$file" | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')" - local extracted_libs echo "$extracted_libs" } @@ -28,7 +27,7 @@ main() { lib=$(echo "$lib" | tr -d '""') echo "Running tests for library $lib" - CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi) + CUDA_VERSION=$(if [ "$lib" = 
"tensorflow" ]; then echo "11.8"; else echo "${RAPIDS_CUDA_VERSION%.*}"; fi) . /opt/conda/etc/profile.d/conda.sh @@ -37,7 +36,7 @@ main() { --config "$dependencies_yaml" \ --output conda \ --file-key "test_${lib}" \ - --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${CUDA_VERSION};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml rapids-mamba-retry env create --yes -f env.yaml -n test diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc01f5286ef..190533abc51 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.2.*,>=0.0.0a0 +- dask-cuda==25.4.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,9 +42,9 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==25.2.*,>=0.0.0a0 +- libkvikio==25.4.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==25.2.*,>=0.0.0a0 +- librmm==25.4.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -82,9 +82,9 @@ dependencies: - python-xxhash - python>=3.10,<3.13 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.2.*,>=0.0.0a0 +- rapids-dask-dependency==25.4.*,>=0.0.0a0 - rich -- rmm==25.2.*,>=0.0.0a0 +- rmm==25.4.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index f4cdbed9be6..e719fd51573 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.2.*,>=0.0.0a0 +- dask-cuda==25.4.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,9 +41,9 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==25.2.*,>=0.0.0a0 +- libkvikio==25.4.*,>=0.0.0a0 - librdkafka>=2.5.0,<2.6.0a0 -- librmm==25.2.*,>=0.0.0a0 +- librmm==25.4.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -81,9 +81,9 @@ dependencies: - python>=3.10,<3.13 - pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.2.*,>=0.0.0a0 +- rapids-dask-dependency==25.4.*,>=0.0.0a0 - rich -- rmm==25.2.*,>=0.0.0a0 +- rmm==25.4.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 13e0cf81625..c6c07dbc150 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,4 +12,4 @@ # the License. # ============================================================================= -set(CUDF_TAG branch-25.02) +set(CUDF_TAG branch-25.04) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 3c6194bb721..a9045d460b3 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,12 +171,12 @@ using statistics_type = std::variant<no_statistics, integer_statistics, double_statistics, string_statistics, bucket_statistics, decimal_statistics, date_statistics, binary_statistics, timestamp_statistics>; //! Orc I/O interfaces -namespace orc { +namespace orc::detail { // forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects, // returned from `read_parsed_orc_statistics`, are constructed from -// `cudf::io::orc::column_statistics` objects that `ProtobufReader` initializes. +// `cudf::io::orc::detail::column_statistics` objects that `ProtobufReader` initializes. struct column_statistics; -} // namespace orc +} // namespace orc::detail /** * @brief Contains per-column ORC statistics. @@ -194,7 +194,7 @@ struct column_statistics { * * @param detail_statistics The statistics to initialize the object with */ - column_statistics(orc::column_statistics&& detail_statistics); + column_statistics(orc::detail::column_statistics&& detail_statistics); }; /** diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a34881942ce..9e171a62f78 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -277,13 +277,24 @@ struct column_name_info { struct table_metadata { std::vector<column_name_info> schema_info; //!< Detailed name information for the entire output hierarchy - std::vector<size_t> num_rows_per_source; //!< Number of rows read from each data source. + std::vector<size_t> num_rows_per_source; //!< Number of rows read from each data source //!< Currently only computed for Parquet readers if no - //!< AST filters being used. Empty vector otherwise. + //!< AST filters being used. Empty vector otherwise std::map<std::string, std::string> user_data; //!< Format-dependent metadata of the first input //!< file as key-values pairs (deprecated) std::vector<std::unordered_map<std::string, std::string>> per_file_user_data; //!< Per file format-dependent metadata as key-values pairs + + // The following variables are currently only computed for Parquet reader + size_type num_input_row_groups{0}; //!< Total number of input row groups across all data sources + std::optional<size_type> + num_row_groups_after_stats_filter; //!< Number of remaining row groups after stats filter. + //!< std::nullopt if no filtering done. Currently only + //!< reported by Parquet readers + std::optional<size_type> + num_row_groups_after_bloom_filter; //!< Number of remaining row groups after bloom filter. + //!< std::nullopt if no filtering done. Currently only + //!< reported by Parquet readers }; /** diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 046e9745a71..6dcd84d3f4d 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,6 +139,7 @@ enum class unary_operator : int32_t { RINT, ///< Rounds the floating-point argument arg to an integer value BIT_INVERT, ///< Bitwise Not (~) NOT, ///< Logical Not (!) + NEGATE, ///< Unary negation (-), only for signed numeric and duration types.
}; /** diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 0f4bde204fa..168beb7fa9e 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -94,6 +94,8 @@ constexpr inline bool has_common_type_v = detail::has_common_type_impl using is_timestamp_t = cuda::std::disjunction, + std::is_same, + std::is_same, std::is_same, std::is_same, std::is_same, @@ -102,6 +104,8 @@ using is_timestamp_t = cuda::std::disjunction /// Checks if a type is a duration type. template using is_duration_t = cuda::std::disjunction, + std::is_same, + std::is_same, std::is_same, std::is_same, std::is_same, diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index ac81dd421fa..7c191b03350 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -33,6 +33,49 @@ namespace cudf::io::detail::nvcomp { namespace { +[[nodiscard]] std::string nvcomp_status_to_string(nvcompStatus_t status) +{ + switch (status) { + case nvcompStatus_t::nvcompSuccess: return "nvcompSuccess"; + case nvcompStatus_t::nvcompErrorInvalidValue: return "nvcompErrorInvalidValue"; + case nvcompStatus_t::nvcompErrorNotSupported: return "nvcompErrorNotSupported"; + case nvcompStatus_t::nvcompErrorCannotDecompress: return "nvcompErrorCannotDecompress"; + case nvcompStatus_t::nvcompErrorBadChecksum: return "nvcompErrorBadChecksum"; + case nvcompStatus_t::nvcompErrorCannotVerifyChecksums: + return "nvcompErrorCannotVerifyChecksums"; + case nvcompStatus_t::nvcompErrorOutputBufferTooSmall: return "nvcompErrorOutputBufferTooSmall"; + case nvcompStatus_t::nvcompErrorWrongHeaderLength: return "nvcompErrorWrongHeaderLength"; + case nvcompStatus_t::nvcompErrorAlignment: return "nvcompErrorAlignment"; + case nvcompStatus_t::nvcompErrorChunkSizeTooLarge: return "nvcompErrorChunkSizeTooLarge"; + case nvcompStatus_t::nvcompErrorCudaError: return "nvcompErrorCudaError"; + case nvcompStatus_t::nvcompErrorInternal: return "nvcompErrorInternal"; + } + return "nvcompStatus_t(" + std::to_string(static_cast(status)) + ")"; +} + +[[nodiscard]] std::string compression_type_name(compression_type compression) +{ + switch (compression) { + case compression_type::SNAPPY: return "Snappy"; + case compression_type::ZSTD: return "Zstandard"; + case compression_type::DEFLATE: return "Deflate"; + case compression_type::LZ4: return "LZ4"; + case compression_type::GZIP: return "GZIP"; + } + return "compression_type(" + std::to_string(static_cast(compression)) + ")"; +} + +#define CHECK_NVCOMP_STATUS(status) \ + do { \ + CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess, \ + "nvCOMP error: " + nvcomp_status_to_string(status)); \ + } while (0) + +#define UNSUPPORTED_COMPRESSION(compression) \ + do { \ + CUDF_FAIL("Unsupported compression type: " + compression_type_name(compression)); \ + } while (0) + // Dispatcher for nvcompBatchedDecompressGetTempSizeEx template auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... args) @@ -48,7 +91,7 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...); case compression_type::GZIP: return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...); - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } @@ -66,22 +109,10 @@ auto batched_decompress_async(compression_type compression, Args&&... args) { case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...); case compression_type::GZIP: return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...); - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } -[[maybe_unused]] std::string compression_type_name(compression_type compression) -{ - switch (compression) { - case compression_type::SNAPPY: return "Snappy"; - case compression_type::ZSTD: return "Zstandard"; - case compression_type::DEFLATE: return "Deflate"; - case compression_type::LZ4: return "LZ4"; - case compression_type::GZIP: return "GZIP"; - } - return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")"; -} - size_t batched_compress_temp_size(compression_type compression, size_t batch_size, size_t max_uncompressed_chunk_bytes, @@ -118,11 +149,9 @@ size_t batched_compress_temp_size(compression_type compression, &temp_size, max_total_uncompressed_bytes); break; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for compression"); + CHECK_NVCOMP_STATUS(nvcomp_status); return temp_size; } @@ -188,9 +217,9 @@ void batched_compress_async(compression_type compression, nvcompBatchedLZ4DefaultOpts, stream.value()); break; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in compression"); + CHECK_NVCOMP_STATUS(nvcomp_status); } bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept @@ -254,9 +283,7 @@ size_t batched_decompress_temp_size(compression_type compression, size_t temp_size = 0; nvcompStatus_t const nvcomp_status = batched_decompress_get_temp_size_ex( compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for decompression"); + CHECK_NVCOMP_STATUS(nvcomp_status); return temp_size; } @@ -289,7 +316,7 @@ void batched_decompress(compression_type compression, nvcomp_args.output_data_ptrs.data(), nvcomp_statuses.data(), stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); + CHECK_NVCOMP_STATUS(nvcomp_status); update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); } @@ -321,11 +348,9 @@ size_t compress_max_output_chunk_size(compression_type compression, status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size); break; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } - - CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess, - "failed to get max uncompressed chunk size"); + CHECK_NVCOMP_STATUS(status); return max_comp_chunk_size; } @@ -463,7 +488,7 @@ size_t required_alignment(compression_type compression) case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment; case 
compression_type::ZSTD: return nvcompZstdRequiredAlignment; case compression_type::LZ4: return nvcompLZ4RequiredAlignment; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } @@ -474,7 +499,7 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression) case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize; case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize; case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize; - default: CUDF_FAIL("Unsupported compression type"); + default: UNSUPPORTED_COMPRESSION(compression); } } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 69fd4068712..0d5bb8ac191 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -292,7 +292,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, CUDF_FAIL("Unsupported source type"); } - orc::metadata const metadata(source.get(), stream); + orc::detail::metadata const metadata(source.get(), stream); // Initialize statistics to return raw_orc_statistics result; @@ -318,7 +318,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, return result; } -column_statistics::column_statistics(orc::column_statistics&& cs) +column_statistics::column_statistics(orc::detail::column_statistics&& cs) { number_of_values = cs.number_of_values; has_null = cs.has_null; @@ -350,9 +350,9 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, result.column_names = raw_stats.column_names; auto parse_column_statistics = [](auto const& raw_col_stats) { - orc::column_statistics stats_internal; - orc::ProtobufReader(reinterpret_cast<uint8_t const*>(raw_col_stats.c_str()), - raw_col_stats.size()) + orc::detail::column_statistics stats_internal; + orc::detail::ProtobufReader(reinterpret_cast<uint8_t const*>(raw_col_stats.c_str()), + raw_col_stats.size()) .read(stats_internal); return column_statistics(std::move(stats_internal)); }; @@ -373,7 +373,7 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, return result; } namespace { -orc_column_schema make_orc_column_schema(host_span<orc::SchemaType const> orc_schema, +orc_column_schema make_orc_column_schema(host_span<orc::detail::SchemaType const> orc_schema, uint32_t column_id, std::string column_name) { @@ -400,7 +400,7 @@ orc_metadata read_orc_metadata(source_info const& src_info, rmm::cuda_stream_vie auto sources = make_datasources(src_info); CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - auto const footer = orc::metadata(sources.front().get(), stream).ff; + auto const footer = orc::detail::metadata(sources.front().get(), stream).ff; return {{make_orc_column_schema(footer.types, 0, "")}, footer.numberOfRows, diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index aeaa87e2202..be3c90a3e24 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -24,7 +24,7 @@ #include -namespace cudf::io::orc::gpu { +namespace cudf::io::orc::detail { /** * @brief Counts the number of characters in each rowgroup of each string column. @@ -266,4 +266,4 @@ void get_dictionary_indices(device_2dspan<stripe_dictionary const> dictionaries, <<<dim_grid, dim_block, 0, stream.value()>>>(dictionaries, columns); } -} // namespace cudf::io::orc::gpu +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 7046b3b3f91..7ae32f3e8f8 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "orc.hpp" +#include "io/comp/io_uncomp.hpp" #include "orc_field_reader.hpp" #include "orc_field_writer.hpp" @@ -25,7 +26,7 @@ #include -namespace cudf::io::orc { +namespace cudf::io::orc::detail { namespace { [[nodiscard]] constexpr uint32_t varint_size(uint64_t val) @@ -496,7 +497,7 @@ metadata::metadata(datasource* const src, rmm::cuda_stream_view stream) : source buffer = source->host_read(len - ps_length - 1 - ps.footerLength - ps.metadataLength, ps.metadataLength); auto const md_data = decompressor->decompress_blocks({buffer->data(), buffer->size()}, stream); - orc::ProtobufReader(md_data.data(), md_data.size()).read(md); + ProtobufReader(md_data.data(), md_data.size()).read(md); init_parent_descriptors(); init_column_names(); @@ -546,4 +547,4 @@ void metadata::init_parent_descriptors() } } -} // namespace cudf::io::orc +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 8dccf65ef10..49652c9a0d2 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -16,8 +16,6 @@ #pragma once -#include "io/comp/io_uncomp.hpp" - #include #include #include @@ -34,9 +32,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { +namespace cudf::io::orc::detail { static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 @@ -710,6 +706,4 @@ struct rowgroup_rows { [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc_field_reader.hpp b/cpp/src/io/orc/orc_field_reader.hpp index 3689e4d958b..797db239538 100644 --- a/cpp/src/io/orc/orc_field_reader.hpp +++ b/cpp/src/io/orc/orc_field_reader.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #include "orc.hpp" -#include +#include /** * @file orc_field_reader.hpp @@ -25,9 +25,7 @@ * ProtobufReader::read(...) functions */ -namespace cudf { -namespace io { -namespace orc { +namespace cudf::io::orc::detail { /** * @brief Functor to run an operator for a specified field. 
@@ -90,6 +88,4 @@ inline void ProtobufReader::function_builder(T& s, size_t maxlen, std::tuple<Operator...>& op) } -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -33,10 +32,7 @@ #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; @@ -65,9 +61,7 @@ auto constexpr VALUE_SENTINEL = size_type{-1}; struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) - : compressed_data(compressed_data_), - uncompressed_data(nullptr), - compressed_data_size(compressed_size_) + : compressed_data(compressed_data_), compressed_data_size(compressed_size_) { } uint8_t const* compressed_data{}; // [in] base ptr to compressed stream data @@ -500,7 +494,4 @@ void reduce_pushdown_masks(device_span<orc_column_device_view const> orc_columns, device_2dspan<cudf::size_type> set_counts, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 726c79bd004..f19fb3c81d8 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "io/comp/gpuinflate.hpp" #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" @@ -40,16 +39,16 @@ namespace cudf::io::orc::detail { std::size_t gather_stream_info_and_column_desc( std::size_t stripe_id, std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, + StripeInformation const* stripeinfo, + StripeFooter const* stripefooter, host_span<int const> orc2gdf, - host_span<orc::SchemaType const> types, + host_span<SchemaType const> types, bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, std::vector<orc_stream_info>* stream_info, - cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>* chunks) + cudf::detail::hostdevice_2dvector<ColumnDesc>* chunks) { CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr), "Either stream_info or chunks must be provided, but not both."); @@ -57,17 +56,17 @@ std::size_t gather_stream_info_and_column_desc( std::size_t src_offset = 0; std::size_t dst_offset = 0; - auto const get_stream_index_type = [](orc::StreamKind kind) { + auto const get_stream_index_type = [](StreamKind kind) { switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; + case DATA: return CI_DATA; + case LENGTH: + case SECONDARY: return CI_DATA2; + case DICTIONARY_DATA: return CI_DICTIONARY; + case PRESENT: return CI_PRESENT; + case ROW_INDEX: return CI_INDEX; default: // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; + return CI_NUM_STREAMS; } }; @@ -87,16 +86,15 @@ std::size_t gather_stream_info_and_column_desc( // for each of its fields. There is only a PRESENT stream, which // needs to be included for the reader. 
auto const schema_type = types[column_id]; - if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT && - stream.kind == orc::PRESENT) { + if (!schema_type.subtypes.empty() && schema_type.kind == STRUCT && stream.kind == PRESENT) { for (auto const& idx : schema_type.subtypes) { auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; if (child_idx >= 0) { col = child_idx; if (chunks) { - auto& chunk = (*chunks)[stripe_id][col]; - chunk.strm_id[gpu::CI_PRESENT] = *local_stream_order; - chunk.strm_len[gpu::CI_PRESENT] = stream.length; + auto& chunk = (*chunks)[stripe_id][col]; + chunk.strm_id[CI_PRESENT] = *local_stream_order; + chunk.strm_len[CI_PRESENT] = stream.length; } } } @@ -105,14 +103,14 @@ std::size_t gather_stream_info_and_column_desc( if (chunks) { if (src_offset >= stripeinfo->indexLength || use_index) { auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { + if (index_type < CI_NUM_STREAMS) { auto& chunk = (*chunks)[stripe_id][col]; chunk.strm_id[index_type] = *local_stream_order; chunk.strm_len[index_type] = stream.length; // NOTE: skip_count field is temporarily used to track the presence of index streams chunk.skip_count |= 1 << index_type; - if (index_type == gpu::CI_DICTIONARY) { + if (index_type == CI_DICTIONARY) { chunk.dictionary_start = *num_dictionary_entries; chunk.dict_len = stripefooter->columns[column_id].dictionarySize; *num_dictionary_entries += @@ -643,7 +641,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) // memory once. auto hd_compinfo = [&] { std::size_t max_num_streams{0}; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { // Find the maximum number of streams in all levels of the loaded stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = @@ -651,7 +649,7 @@ void reader_impl::load_next_stripe_data(read_mode mode) max_num_streams = std::max(max_num_streams, stream_range.size()); } } - return cudf::detail::hostdevice_vector(max_num_streams, _stream); + return cudf::detail::hostdevice_vector(max_num_streams, _stream); }(); for (std::size_t level = 0; level < num_levels; ++level) { @@ -665,26 +663,26 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const stream_range = merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range); - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( + auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = static_cast(stripe_data[info.source.stripe_idx - stripe_start].data()); compinfo[stream_idx - stream_range.begin] = - gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length); + CompressedStreamInfo(dst_base + info.dst_pos, info.length); } // Estimate the uncompressed data. 
compinfo.host_to_device_async(_stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - _stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + _stream); compinfo.device_to_host_sync(_stream); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 4ef68ee8d86..cb66edf3c98 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,15 +307,15 @@ std::vector find_splits(host_span cumulative_sizes, std::size_t gather_stream_info_and_column_desc( std::size_t stripe_id, std::size_t level, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, + StripeInformation const* stripeinfo, + StripeFooter const* stripefooter, host_span orc2gdf, - host_span types, + host_span types, bool use_index, bool apply_struct_map, int64_t* num_dictionary_entries, std::size_t* local_stream_order, std::vector* stream_info, - cudf::detail::hostdevice_2dvector* chunks); + cudf::detail::hostdevice_2dvector* chunks); } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index b661bb4ff90..586c07cbc16 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
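Context for the decompression hunks that follow: ORC prefixes every compression block with a 3-byte little-endian header (block_header_size = 3 in orc.hpp) that packs (length << 1) | is_original. A hedged host-side sketch of the decoding that ParseCompressedStripeData performs per block on the device:

#include <cstdint>

struct block_header {
  uint32_t length;   // stored block length in bytes
  bool is_original;  // true: block bytes are raw, not compressed
};

inline block_header decode_block_header(uint8_t const* p)
{
  uint32_t const raw = static_cast<uint32_t>(p[0]) |
                       (static_cast<uint32_t>(p[1]) << 8) |
                       (static_cast<uint32_t>(p[2]) << 16);
  return {raw >> 1, (raw & 1u) != 0};
}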
@@ -77,13 +77,13 @@ rmm::device_buffer decompress_stripe_data( range const& loaded_stripe_range, range const& stream_range, std::size_t num_decode_stripes, - cudf::detail::hostdevice_span compinfo, + cudf::detail::hostdevice_span compinfo, stream_source_map const& compinfo_map, OrcDecompressor const& decompressor, host_span stripe_data, host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, size_type row_index_stride, bool use_base_stride, rmm::cuda_stream_view stream) @@ -100,7 +100,7 @@ rmm::device_buffer decompress_stripe_data( auto const& info = stream_info[stream_idx]; auto& stream_comp_info = compinfo[stream_idx - stream_range.begin]; - stream_comp_info = gpu::CompressedStreamInfo( + stream_comp_info = CompressedStreamInfo( static_cast( stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, @@ -120,11 +120,11 @@ rmm::device_buffer decompress_stripe_data( if (!compinfo_ready) { compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); compinfo.device_to_host_sync(stream); for (std::size_t i = 0; i < compinfo.size(); ++i) { @@ -178,11 +178,11 @@ rmm::device_buffer decompress_stripe_data( } compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); + ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); // Value for checking whether we decompress successfully. // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. @@ -275,7 +275,7 @@ rmm::device_buffer decompress_stripe_data( // Copy without stream sync, thus need to wait for stream sync below to access. any_block_failure.device_to_host_async(stream); - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); + PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. 
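The call ordering here is the usual single-sync pattern: queue the device-to-host copy of the failure flag asynchronously, queue the remaining work, and let one synchronize cover both. A standalone sketch in plain CUDA (names hypothetical):

#include <cuda_runtime.h>

// `h_flag` must be pinned host memory (cudaMallocHost) for the copy to
// be truly asynchronous; the single synchronize below then covers the
// copy and any kernels enqueued after it.
void check_failure_flag(bool const* d_flag, bool* h_flag, cudaStream_t stream)
{
  cudaMemcpyAsync(h_flag, d_flag, sizeof(bool), cudaMemcpyDeviceToHost, stream);
  // ... enqueue further kernels on `stream` here ...
  cudaStreamSynchronize(stream);
  if (*h_flag) { /* handle per-block decompression failure */ }
}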
// We can check on host after stream synchronize @@ -291,7 +291,7 @@ rmm::device_buffer decompress_stripe_data( for (std::size_t i = 0; i < num_decode_stripes; ++i) { for (std::size_t j = 0; j < num_columns; ++j) { auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + for (int k = 0; k < CI_NUM_STREAMS; ++k) { if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; @@ -303,14 +303,14 @@ rmm::device_buffer decompress_stripe_data( if (row_groups.size().first) { chunks.host_to_device_async(stream); row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_decode_stripes, - row_index_stride, - use_base_stride, - stream); + ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_decode_stripes, + row_index_stride, + use_base_stride, + stream); } return decomp_data; @@ -329,7 +329,7 @@ rmm::device_buffer decompress_stripe_data( * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to use for device memory allocation */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, host_span out_buffers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -419,8 +419,8 @@ void decode_stream_data(int64_t num_dicts, size_type row_index_stride, std::size_t level, table_device_view const& d_tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, std::vector& out_buffers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -441,10 +441,10 @@ void decode_stream_data(int64_t num_dicts, }); // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); + rmm::device_uvector global_dict(num_dicts, stream); chunks.host_to_device_async(stream); - gpu::DecodeNullsAndStringDictionaries( + DecodeNullsAndStringDictionaries( chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); if (level > 0) { @@ -453,18 +453,18 @@ void decode_stream_data(int64_t num_dicts, } cudf::detail::device_scalar error_count(0, stream); - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - d_tz_table, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); + DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + d_tz_table, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); chunks.device_to_host_async(stream); // `value` synchronizes auto const num_errors = error_count.value(stream); @@ -485,7 +485,7 @@ void decode_stream_data(int64_t num_dicts, * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current * layer. 
*/ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, uint32_t* d_prefix_sums, rmm::cuda_stream_view stream) { @@ -531,9 +531,9 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& * @brief Aggregate child metadata from parent column chunks. */ void aggregate_child_meta(std::size_t level, - cudf::io::orc::detail::column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, + column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, host_span nested_cols, host_span out_buffers, reader_column_meta& col_meta) @@ -766,7 +766,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level. // Unfortunately we cannot create one hostdevice_vector to use for all levels because // currently we do not have a hostdevice_2dspan class. - std::vector> lvl_chunks(num_levels); + std::vector> lvl_chunks(num_levels); // For computing null count. auto null_count_prefix_sums = [&] { @@ -787,7 +787,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // thus only need to allocate memory once. auto hd_compinfo = [&] { std::size_t max_num_streams{0}; - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { // Find the maximum number of streams in all levels of the decoding stripes. for (std::size_t level = 0; level < num_levels; ++level) { auto const stream_range = @@ -795,7 +795,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) max_num_streams = std::max(max_num_streams, stream_range.size()); } } - return cudf::detail::hostdevice_vector{max_num_streams, _stream}; + return cudf::detail::hostdevice_vector{max_num_streams, _stream}; }(); auto& col_meta = *_col_meta; @@ -812,8 +812,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) auto& chunks = lvl_chunks[level]; auto const num_lvl_columns = columns_level.size(); - chunks = - cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); + chunks = cudf::detail::hostdevice_2dvector(stripe_count, num_lvl_columns, _stream); memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); bool const use_index = @@ -897,7 +896,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // num_child_rows for a struct column will be same, for other nested types it will be // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.num_child_rows = (chunk.type_kind != STRUCT) ? 
0 : chunk.num_rows; chunk.dtype_id = column_types[col_idx].id(); chunk.decimal_scale = _metadata.per_file_metadata[stripe.source_idx] .ff.types[columns_level[col_idx].id] @@ -912,11 +911,11 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { + if (chunk.type_kind == TIMESTAMP) { chunk.timestamp_type_id = _options.timestamp_type.id(); } if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + for (int k = 0; k < CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos; } @@ -931,10 +930,10 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Process dataset chunks into output columns. auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); + cudf::detail::hostdevice_2dvector(num_rowgroups, num_lvl_columns, _stream); if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_lvl_columns); + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_lvl_columns); auto& rw_grp_meta = col_meta.rwgrp_meta; // Update start row and num rows per row group @@ -950,9 +949,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) } // Setup row group descriptors if using indexes. - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata.per_file_metadata[0].ps.compression != NONE) { auto const compinfo = - cudf::detail::hostdevice_span{hd_compinfo}.subspan( + cudf::detail::hostdevice_span{hd_compinfo}.subspan( 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, @@ -979,14 +978,14 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) chunks.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_lvl_columns, - stripe_count, - _metadata.get_row_index_stride(), - level == 0, - _stream); + ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_lvl_columns, + stripe_count, + _metadata.get_row_index_stride(), + level == 0, + _stream); } } @@ -995,7 +994,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) for (std::size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (std::size_t j = 0; j < stripe_count; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + if (chunks[j][i].strm_len[CI_PRESENT] != 0) { is_nullable = true; break; } diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 7e5db4b7617..1d4aaaf51ef 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
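scan_null_counts above is, per struct column, a running total over per-stripe null counts that feeds null_count_prefix_sums. A reduced single-column sketch with Thrust (assumed semantics, not the kernel actually used):

#include <thrust/scan.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cstddef>
#include <cstdint>

// In-place running total of per-stripe null counts for one column, so a
// stripe can look up the null count accumulated up to and including it.
void prefix_sum_null_counts(uint32_t* d_null_counts,
                            std::size_t num_stripes,
                            cudaStream_t stream)
{
  thrust::inclusive_scan(thrust::cuda::par.on(stream),
                         d_null_counts,
                         d_null_counts + num_stripes,
                         d_null_counts);
}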
@@ -34,7 +34,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); switch (kind) { - case orc::LIST: { + case LIST: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back(""); return make_lists_column(0, @@ -50,7 +50,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, rmm::device_buffer{0, stream}, stream); } - case orc::MAP: { + case MAP: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back("struct"); auto const child_column_ids = metadata.get_col_type(orc_col_id).subtypes; @@ -76,7 +76,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, stream); } - case orc::STRUCT: { + case STRUCT: { std::vector> child_columns; for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { schema_info.children.emplace_back(""); @@ -92,7 +92,7 @@ std::unique_ptr create_empty_column(size_type orc_col_id, 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); } - case orc::DECIMAL: { + case DECIMAL: { int32_t scale = 0; if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); @@ -119,8 +119,8 @@ column_buffer assemble_buffer(size_type orc_col_id, col_buffer.name = metadata.column_name(0, orc_col_id); auto kind = metadata.get_col_type(orc_col_id).kind; switch (kind) { - case orc::LIST: - case orc::STRUCT: { + case LIST: + case STRUCT: { auto const& children_indices = selected_columns.children.at(orc_col_id); for (auto const child_id : children_indices) { col_buffer.children.emplace_back(assemble_buffer( @@ -128,7 +128,7 @@ column_buffer assemble_buffer(size_type orc_col_id, } } break; - case orc::MAP: { + case MAP: { std::vector child_col_buffers; // Get child buffers auto const& children_indices = selected_columns.children.at(orc_col_id); diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 4cded30d89b..f2e746b312f 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
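The negation in create_empty_column's DECIMAL case above is the sign flip between two conventions: ORC records a non-negative scale (digits after the decimal point) while cudf's fixed_point scale is a power-of-ten exponent. An illustration with cudf's public fixed-point type:

#include <cudf/fixed_point/fixed_point.hpp>

// ORC decimal(precision = 9, scale = 2) stores 1.23 as unscaled 123;
// the equivalent cudf value carries the negated scale.
numeric::decimal32 const value{
  numeric::scaled_integer<int32_t>{123, numeric::scale_type{-2}}};  // 1.23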
@@ -61,36 +61,36 @@ struct reader_column_meta { /** * @brief Function that translates ORC data kind to cuDF type enum */ -inline constexpr type_id to_cudf_type(orc::TypeKind kind, +inline constexpr type_id to_cudf_type(TypeKind kind, bool use_np_dtypes, type_id timestamp_type_id, type_id decimal_type_id) { switch (kind) { - case orc::BOOLEAN: return type_id::BOOL8; - case orc::BYTE: return type_id::INT8; - case orc::SHORT: return type_id::INT16; - case orc::INT: return type_id::INT32; - case orc::LONG: return type_id::INT64; - case orc::FLOAT: return type_id::FLOAT32; - case orc::DOUBLE: return type_id::FLOAT64; - case orc::STRING: - case orc::BINARY: - case orc::VARCHAR: - case orc::CHAR: + case BOOLEAN: return type_id::BOOL8; + case BYTE: return type_id::INT8; + case SHORT: return type_id::INT16; + case INT: return type_id::INT32; + case LONG: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case STRING: + case BINARY: + case VARCHAR: + case CHAR: // Variable-length types can all be mapped to STRING return type_id::STRING; - case orc::TIMESTAMP: + case TIMESTAMP: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; - case orc::DATE: + case DATE: // There isn't a (DAYS -> np.dtype) mapping return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; - case orc::DECIMAL: return decimal_type_id; + case DECIMAL: return decimal_type_id; // Need to update once cuDF plans to support map type - case orc::MAP: - case orc::LIST: return type_id::LIST; - case orc::STRUCT: return type_id::STRUCT; + case MAP: + case LIST: return type_id::LIST; + case STRUCT: return type_id::STRUCT; default: break; } diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 5f4c1e0696d..e81c74ae1a6 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -24,7 +24,7 @@ #include -namespace cudf::io::orc::gpu { +namespace cudf::io::orc::detail { using strings::detail::fixed_point_string_size; @@ -502,4 +502,4 @@ void orc_encode_statistics(uint8_t* blob_bfr, blob_bfr, groups, chunks, statistics_count); } -} // namespace cudf::io::orc::gpu +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index f560b806894..c7947b0e4c9 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -15,6 +15,7 @@ */ #include "io/utilities/block_utils.cuh" +#include "io/utilities/column_buffer.hpp" #include "orc_gpu.hpp" #include @@ -23,10 +24,7 @@ #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::io::detail::string_index_pair; @@ -2096,7 +2094,4 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks, chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 857daeb5856..15ce1aadb17 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include "io/comp/gpuinflate.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" @@ -40,10 +39,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { using cudf::detail::device_2dspan; using cudf::io::detail::compression_result; @@ -1421,7 +1417,4 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, <<>>(rg_bounds, d_sizes); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 5e23bc5adcc..a72b71a83ca 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -26,10 +26,7 @@ #include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::detail { struct comp_in_out { uint8_t const* in_ptr{}; @@ -605,7 +602,4 @@ void __host__ reduce_pushdown_masks(device_span co <<>>(columns, rowgroups, valid_counts); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 5c3377a1aeb..ed900105968 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -21,7 +21,6 @@ #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" -#include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" #include @@ -40,6 +39,7 @@ #include #include #include +#include #include #include @@ -92,15 +92,15 @@ namespace { /** * @brief Translates cuDF compression to ORC compression. */ -orc::CompressionKind to_orc_compression(compression_type compression) +CompressionKind to_orc_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return orc::CompressionKind::SNAPPY; - case compression_type::ZLIB: return orc::CompressionKind::ZLIB; - case compression_type::ZSTD: return orc::CompressionKind::ZSTD; - case compression_type::LZ4: return orc::CompressionKind::LZ4; - case compression_type::NONE: return orc::CompressionKind::NONE; + case compression_type::SNAPPY: return CompressionKind::SNAPPY; + case compression_type::ZLIB: return CompressionKind::ZLIB; + case compression_type::ZSTD: return CompressionKind::ZSTD; + case compression_type::LZ4: return CompressionKind::LZ4; + case compression_type::NONE: return CompressionKind::NONE; default: CUDF_FAIL("Unsupported compression type"); } } @@ -119,7 +119,7 @@ size_t compression_block_size(compression_type compression) /** * @brief Translates cuDF dtype to ORC datatype */ -constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) +constexpr TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) { switch (id) { case cudf::type_id::INT8: return TypeKind::BYTE; @@ -237,8 +237,8 @@ class orc_column_view { [[nodiscard]] auto const& decimal_offsets() const { return d_decimal_offsets; } void attach_decimal_offsets(uint32_t* sizes_ptr) { d_decimal_offsets = sizes_ptr; } - void attach_stripe_dicts(host_span host_stripe_dicts, - device_span dev_stripe_dicts) + void attach_stripe_dicts(host_span host_stripe_dicts, + device_span dev_stripe_dicts) { stripe_dicts = host_stripe_dicts; d_stripe_dicts = dev_stripe_dicts; @@ -301,8 +301,8 @@ class orc_column_view { host_span rowgroup_char_counts; - host_span stripe_dicts; - device_span d_stripe_dicts; + host_span 
stripe_dicts; + device_span d_stripe_dicts; // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. @@ -498,17 +498,17 @@ size_t RLE_stream_size(TypeKind kind, size_t count) case TypeKind::BYTE: return div_rounding_up_unsafe(count, byte_rle_max_len) * (byte_rle_max_len + 1); case TypeKind::SHORT: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); case TypeKind::FLOAT: case TypeKind::INT: case TypeKind::DATE: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); case TypeKind::LONG: case TypeKind::DOUBLE: - return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * max_varint_size() + 2); + return div_rounding_up_unsafe(count, encode_block_size) * + (encode_block_size * max_varint_size() + 2); default: CUDF_FAIL("Unsupported ORC type for RLE stream size: " + std::to_string(kind)); } } @@ -536,7 +536,7 @@ orc_streams create_streams(host_span columns, return Stream{ROW_INDEX, col.id()}; }); - std::vector ids(columns.size() * gpu::CI_NUM_STREAMS, -1); + std::vector ids(columns.size() * CI_NUM_STREAMS, -1); std::vector types(streams.size(), INVALID_TYPE_KIND); for (auto& column : columns) { @@ -568,41 +568,39 @@ orc_streams create_streams(host_span columns, auto const kind = column.orc_kind(); auto add_stream = - [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { + [&](StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; - const auto base = column.index() * gpu::CI_NUM_STREAMS; + const auto base = column.index() * CI_NUM_STREAMS; ids[base + index_type] = streams.size(); - streams.push_back(orc::Stream{ - kind, - column.id(), - (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); + streams.push_back( + Stream{kind, + column.id(), + (size == 0) ? 0 : size + max_alignment_padding * segmentation.num_rowgroups()}); types.push_back(type_kind); }; - auto add_RLE_stream = [&]( - gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind) { + auto add_RLE_stream = [&](StreamIndexType index_type, StreamKind kind, TypeKind type_kind) { add_stream(index_type, kind, type_kind, RLE_column_size(type_kind)); }; - if (is_nullable) { add_RLE_stream(gpu::CI_PRESENT, PRESENT, TypeKind::BOOLEAN); } + if (is_nullable) { add_RLE_stream(CI_PRESENT, PRESENT, TypeKind::BOOLEAN); } switch (kind) { case TypeKind::BOOLEAN: case TypeKind::BYTE: - add_RLE_stream(gpu::CI_DATA, DATA, kind); + add_RLE_stream(CI_DATA, DATA, kind); column.set_orc_encoding(DIRECT); break; case TypeKind::SHORT: case TypeKind::INT: case TypeKind::LONG: case TypeKind::DATE: - add_RLE_stream(gpu::CI_DATA, DATA, kind); + add_RLE_stream(CI_DATA, DATA, kind); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::FLOAT: case TypeKind::DOUBLE: // Pass through if no nulls (no RLE encoding for floating point) - add_stream( - gpu::CI_DATA, DATA, kind, (column.null_count() != 0) ? RLE_column_size(kind) : 0); + add_stream(CI_DATA, DATA, kind, (column.null_count() != 0) ? 
RLE_column_size(kind) : 0); column.set_orc_encoding(DIRECT); break; case TypeKind::STRING: { @@ -632,35 +630,34 @@ orc_streams create_streams(host_span columns, // Decide between direct or dictionary encoding if (enable_dict && dict_data_size < direct_data_size) { - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::INT); - add_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT, dict_lengths_div512 * (512 * 4 + 2)); - add_stream( - gpu::CI_DICTIONARY, DICTIONARY_DATA, TypeKind::CHAR, std::max(dict_data_size, 1ul)); + add_RLE_stream(CI_DATA, DATA, TypeKind::INT); + add_stream(CI_DATA2, LENGTH, TypeKind::INT, dict_lengths_div512 * (512 * 4 + 2)); + add_stream(CI_DICTIONARY, DICTIONARY_DATA, TypeKind::CHAR, std::max(dict_data_size, 1ul)); column.set_orc_encoding(DICTIONARY_V2); } else { - add_stream(gpu::CI_DATA, DATA, TypeKind::CHAR, std::max(direct_data_size, 1)); - add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); + add_stream(CI_DATA, DATA, TypeKind::CHAR, std::max(direct_data_size, 1)); + add_RLE_stream(CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); } break; } case TypeKind::TIMESTAMP: - add_RLE_stream(gpu::CI_DATA, DATA, TypeKind::LONG); - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::LONG); + add_RLE_stream(CI_DATA, DATA, TypeKind::LONG); + add_RLE_stream(CI_DATA2, SECONDARY, TypeKind::LONG); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::DECIMAL: // varint values (NO RLE) // data_stream_size = decimal_column_sizes.at(column.index()); - add_stream(gpu::CI_DATA, DATA, TypeKind::DECIMAL, decimal_column_sizes.at(column.index())); + add_stream(CI_DATA, DATA, TypeKind::DECIMAL, decimal_column_sizes.at(column.index())); // scale stream TODO: compute exact size since all elems are equal - add_RLE_stream(gpu::CI_DATA2, SECONDARY, TypeKind::INT); + add_RLE_stream(CI_DATA2, SECONDARY, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::LIST: case TypeKind::MAP: // no data stream, only lengths - add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); + add_RLE_stream(CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; case TypeKind::STRUCT: @@ -683,7 +680,7 @@ std::vector> calculate_aligned_rowgroup_bounds( orc_table.num_columns() * segmentation.num_rowgroups(), stream); auto const d_pd_set_counts = device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; - gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); + reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( segmentation.num_rowgroups(), orc_table.num_columns(), stream); @@ -838,7 +835,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); - hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); + hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); @@ -911,7 +908,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, " Please see https://github.com/rapidsai/cudf/issues/6763 for more information."); } - hostdevice_2dvector chunk_streams( + hostdevice_2dvector chunk_streams( num_columns, segmentation.num_rowgroups(), stream); // per-stripe, per-stream owning buffers std::vector>> encoded_data(segmentation.num_stripes()); @@ -921,10 +918,10 @@ encoded_data encode_columns(orc_table_view const& 
orc_table, }); for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { - for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { + for (int strm_type = 0; strm_type < CI_NUM_STREAMS; ++strm_type) { auto const& column = orc_table.column(col_idx); auto col_streams = chunk_streams[col_idx]; - auto const strm_id = streams.id(col_idx * gpu::CI_NUM_STREAMS + strm_type); + auto const strm_id = streams.id(col_idx * CI_NUM_STREAMS + strm_type); std::for_each(stripe.cbegin(), stripe.cend(), [&](auto rg_idx) { col_streams[rg_idx].ids[strm_type] = strm_id; @@ -938,25 +935,25 @@ encoded_data encode_columns(orc_table_view const& orc_table, auto const& ck = chunks[col_idx][rg_idx]; auto& strm = col_streams[rg_idx]; - if ((strm_type == gpu::CI_DICTIONARY) || - (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { + if ((strm_type == CI_DICTIONARY) || + (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { if (rg_idx == *stripe.cbegin()) { auto const stripe_dict = column.host_stripe_dict(stripe.id); strm.lengths[strm_type] = - (strm_type == gpu::CI_DICTIONARY) + (strm_type == CI_DICTIONARY) ? stripe_dict.char_count : (((stripe_dict.entry_count + 0x1ff) >> 9) * (512 * 4 + 2)); } else { strm.lengths[strm_type] = 0; } - } else if (strm_type == gpu::CI_DATA && ck.type_kind == TypeKind::STRING && + } else if (strm_type == CI_DATA && ck.type_kind == TypeKind::STRING && ck.encoding_kind == DIRECT_V2) { strm.lengths[strm_type] = std::max(column.rowgroup_char_count(rg_idx), 1); - } else if (strm_type == gpu::CI_DATA && streams[strm_id].length == 0 && + } else if (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT)) { // Pass-through strm.lengths[strm_type] = ck.num_rows * ck.dtype_len; - } else if (ck.type_kind == DECIMAL && strm_type == gpu::CI_DATA) { + } else if (ck.type_kind == DECIMAL && strm_type == CI_DATA) { strm.lengths[strm_type] = dec_chunk_sizes.rg_sizes.at(col_idx)[rg_idx]; } else { strm.lengths[strm_type] = RLE_stream_size(streams.type(strm_id), ck.num_rows); @@ -974,12 +971,12 @@ encoded_data encode_columns(orc_table_view const& orc_table, auto const& ck = chunks[col_idx][rg_idx]; auto& strm = col_streams[rg_idx]; - if (strm_id < 0 or (strm_type == gpu::CI_DATA && streams[strm_id].length == 0 && + if (strm_id < 0 or (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT))) { strm.data_ptrs[strm_type] = nullptr; } else { - if ((strm_type == gpu::CI_DICTIONARY) || - (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { + if ((strm_type == CI_DICTIONARY) || + (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { strm.data_ptrs[strm_type] = encoded_data[stripe.id][strm_id].data(); } else { strm.data_ptrs[strm_type] = (rg_idx_it == stripe.cbegin()) @@ -1003,16 +1000,16 @@ encoded_data encode_columns(orc_table_view const& orc_table, if (orc_table.num_rows() > 0) { if (orc_table.num_string_columns() != 0) { auto d_stripe_dict = orc_table.string_column(0).device_stripe_dicts(); - gpu::EncodeStripeDictionaries(d_stripe_dict.data(), - orc_table.d_columns, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); + EncodeStripeDictionaries(d_stripe_dict.data(), + orc_table.d_columns, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + EncodeOrcColumnData(chunks, chunk_streams, 
stream); } chunk_streams.device_to_host_sync(stream); @@ -1034,7 +1031,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, std::vector gather_stripes(size_t num_index_streams, file_segmentation const& segmentation, encoded_data* enc_data, - hostdevice_2dvector* strm_desc, + hostdevice_2dvector* strm_desc, rmm::cuda_stream_view stream) { if (segmentation.num_stripes() == 0) { return {}; } @@ -1051,7 +1048,7 @@ std::vector gather_stripes(size_t num_index_streams, for (size_t col_idx = 0; col_idx < enc_data->streams.size().first; col_idx++) { auto const& col_streams = (enc_data->streams)[col_idx]; // Assign stream data of column data stream(s) - for (int k = 0; k < gpu::CI_INDEX; k++) { + for (int k = 0; k < CI_INDEX; k++) { auto const stream_id = col_streams[0].ids[k]; if (stream_id != -1) { auto const actual_stripe_size = std::accumulate( @@ -1091,7 +1088,7 @@ std::vector gather_stripes(size_t num_index_streams, strm_desc->host_to_device_async(stream); // TODO: use cub::DeviceMemcpy::Batched - gpu::CompactOrcDataStreams(*strm_desc, enc_data->streams, stream); + CompactOrcDataStreams(*strm_desc, enc_data->streams, stream); strm_desc->device_to_host_async(stream); enc_data->streams.device_to_host_sync(stream); @@ -1123,17 +1120,17 @@ cudf::detail::hostdevice_vector allocate_and_encode_blobs( rmm::cuda_stream_view stream) { // figure out the buffer size needed for protobuf format - gpu::orc_init_statistics_buffersize( + orc_init_statistics_buffersize( stats_merge_groups.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); auto max_blobs = stats_merge_groups.element(num_stat_blobs - 1, stream); cudf::detail::hostdevice_vector blobs(max_blobs.start_chunk + max_blobs.num_chunks, stream); - gpu::orc_encode_statistics(blobs.device_ptr(), - stats_merge_groups.device_ptr(), - stat_chunks.data(), - num_stat_blobs, - stream); + orc_encode_statistics(blobs.device_ptr(), + stats_merge_groups.device_ptr(), + stat_chunks.data(), + num_stat_blobs, + stream); stats_merge_groups.device_to_host_async(stream); blobs.device_to_host_sync(stream); return blobs; @@ -1238,7 +1235,7 @@ intermediate_statistics gather_statistic_blobs(statistics_freq const stats_freq, auto stripe_stat_chunks = stripe_chunks.data(); rmm::device_uvector rowgroup_groups(num_rowgroup_blobs, stream); - gpu::orc_init_statistics_groups( + orc_init_statistics_groups( rowgroup_groups.data(), stat_desc.device_ptr(), segmentation.rowgroups, stream); detail::calculate_group_statistics( @@ -1440,8 +1437,8 @@ void write_index_stream(int32_t stripe_id, int32_t stream_id, host_span columns, file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, + host_2dspan enc_streams, + host_2dspan strm_desc, host_span comp_res, host_span rg_stats, StripeInformation* stripe, @@ -1455,8 +1452,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info data2; auto const column_id = stream_id - 1; - auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const& stream, - gpu::StreamIndexType type) { + auto find_record = [=, &strm_desc](encoder_chunk_streams const& stream, StreamIndexType type) { row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; @@ -1469,8 +1465,8 @@ void write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_res](gpu::encoder_chunk_streams const& stream, - gpu::StreamIndexType type, + auto scan_record = [=, &comp_res](encoder_chunk_streams const& stream, + StreamIndexType type, row_group_index_info& record) { if (record.pos >= 
0) { record.pos += stream.lengths[type]; @@ -1489,9 +1485,9 @@ void write_index_stream(int32_t stripe_id, // TBD: Not sure we need an empty index stream for column 0 if (stream_id != 0) { auto const& strm = enc_streams[column_id][0]; - present = find_record(strm, gpu::CI_PRESENT); - data = find_record(strm, gpu::CI_DATA); - data2 = find_record(strm, gpu::CI_DATA2); + present = find_record(strm, CI_PRESENT); + data = find_record(strm, CI_DATA); + data2 = find_record(strm, CI_DATA2); // Change string dictionary to int from index point of view kind = columns[column_id].orc_kind(); @@ -1518,9 +1514,9 @@ void write_index_stream(int32_t stripe_id, if (stream_id != 0) { const auto& strm = enc_streams[column_id][rowgroup]; - scan_record(strm, gpu::CI_PRESENT, present); - scan_record(strm, gpu::CI_DATA, data); - scan_record(strm, gpu::CI_DATA2, data2); + scan_record(strm, CI_PRESENT, present); + scan_record(strm, CI_DATA, data); + scan_record(strm, CI_DATA2, data2); } }); @@ -1549,8 +1545,8 @@ void write_index_stream(int32_t stripe_id, * @param[in] stream CUDA stream used for device memory operations and kernel launches * @return An std::future that should be synchronized to ensure the writing is complete */ -std::future write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, +std::future write_data_stream(StripeStream const& strm_desc, + encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, host_span bounce_buffer, StripeInformation* stripe, @@ -1944,7 +1940,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, if (elem_sizes.empty()) return {}; // Compute element offsets within each row group - gpu::decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); + decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); // Gather the row group sizes and copy to host auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream); @@ -2011,11 +2007,11 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, auto counts = rmm::device_uvector(num_str_cols * num_rowgroups, stream); auto counts_2d_view = device_2dspan(counts, num_rowgroups); - gpu::rowgroup_char_counts(counts_2d_view, - orc_table.d_columns, - rowgroup_bounds, - orc_table.d_string_column_indices, - stream); + rowgroup_char_counts(counts_2d_view, + orc_table.d_columns, + rowgroup_bounds, + orc_table.d_string_column_indices, + stream); auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream); @@ -2030,7 +2026,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, // Holds the stripe dictionary descriptors and dictionary buffers. 
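One sizing rule from RLE_stream_size earlier in this file is worth spelling out (the stripe_dictionaries descriptor struct follows right after this note): the writer reserves worst-case output per RLE block, i.e. every value at maximum varint width plus a two-byte block header. A sketch of that arithmetic, names hypothetical:

#include <cstddef>

constexpr std::size_t max_varint_bytes = 10;  // ceil(64 / 7)

constexpr std::size_t rle_upper_bound(std::size_t count, std::size_t block_size)
{
  std::size_t const num_blocks = (count + block_size - 1) / block_size;
  return num_blocks * (block_size * max_varint_bytes + 2);
}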
struct stripe_dictionaries { - hostdevice_2dvector views; // descriptors [string_column][stripe] + hostdevice_2dvector views; // descriptors [string_column][stripe] std::vector> data_owner; // dictionary data owner, per stripe std::vector> index_owner; // dictionary index owner, per stripe std::vector> order_owner; // dictionary order owner, per stripe @@ -2082,17 +2078,17 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); - total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; + total_map_storage_size += stripe_num_rows * occupancy_factor; } hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } - hostdevice_2dvector stripe_dicts( + hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; // Create a single bulk storage to use for all sub-dictionaries - auto map_storage = std::make_unique( + auto map_storage = std::make_unique( total_map_storage_size, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); @@ -2121,8 +2117,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); - gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); + map_storage->initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()}); + populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host stripe_dicts.device_to_host_sync(stream); @@ -2169,8 +2165,8 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, // Synchronize to ensure the copy is complete before we clear `map_slots` stripe_dicts.host_to_device_sync(stream); - gpu::collect_map_entries(stripe_dicts, stream); - gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); + collect_map_entries(stripe_dicts, stream); + get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point map_storage.reset(); @@ -2299,7 +2295,7 @@ auto convert_table_to_orc_data(table_view const& input, // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); auto const num_data_streams = streams.size() - num_index_streams; - hostdevice_2dvector strm_descs( + hostdevice_2dvector strm_descs( segmentation.num_stripes(), num_data_streams, stream); auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data, &strm_descs, stream); @@ -2353,17 +2349,17 @@ auto convert_table_to_orc_data(table_view const& input, compression_result{0, compression_status::FAILURE}); if (compression != compression_type::NONE) { strm_descs.host_to_device_async(stream); - compression_stats = gpu::CompressOrcDataStreams(compressed_data, - num_compressed_blocks, - compression, - compression_blocksize, - max_compressed_block_size, - block_align, - collect_compression_stats, - strm_descs, - enc_data.streams, - comp_results, - stream); + compression_stats = CompressOrcDataStreams(compressed_data, + num_compressed_blocks, + compression, + compression_blocksize, + 
max_compressed_block_size, + block_align, + collect_compression_stats, + strm_descs, + enc_data.streams, + comp_results, + stream); // deallocate encoded data as it is not needed anymore enc_data.data.clear(); @@ -2535,7 +2531,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, orc_table_view const& orc_table, device_span compressed_data, host_span comp_results, - host_2dspan strm_descs, + host_2dspan strm_descs, host_span rg_stats, orc_streams& streams, host_span stripes, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 7d23482cb17..b6a27d5a6c5 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -136,7 +136,7 @@ struct file_segmentation { */ struct encoded_data { std::vector>> data; // Owning array of the encoded data - hostdevice_2dvector streams; // streams of encoded data, per chunk + hostdevice_2dvector streams; // streams of encoded data, per chunk }; /** @@ -309,7 +309,7 @@ class writer::impl { orc_table_view const& orc_table, device_span compressed_data, host_span comp_results, - host_2dspan strm_descs, + host_2dspan strm_descs, host_span rg_stats, orc_streams& streams, host_span stripes, diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index af524e1f70a..a883981a467 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -599,9 +599,11 @@ std::vector aggregate_reader_metadata::get_parquet_types( return parquet_types; } -std::optional>> aggregate_reader_metadata::apply_bloom_filters( +std::pair>>, bool> +aggregate_reader_metadata::apply_bloom_filters( host_span const> sources, host_span const> input_row_group_indices, + size_type total_row_groups, host_span output_dtypes, host_span output_column_schemas, std::reference_wrapper filter, @@ -610,17 +612,6 @@ std::optional>> aggregate_reader_metadata::ap // Number of input table columns auto const num_input_columns = static_cast(output_dtypes.size()); - // Total number of row groups after StatsAST filtration - auto const total_row_groups = std::accumulate( - input_row_group_indices.begin(), - input_row_group_indices.end(), - size_t{0}, - [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); - - // Check if we have less than 2B total row groups. 
- CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), - "Total number of row groups exceed the size_type's limit"); - // Collect equality literals for each input table column auto const equality_literals = equality_literals_collector{filter.get(), num_input_columns}.get_equality_literals(); @@ -635,7 +626,7 @@ std::optional>> aggregate_reader_metadata::ap [](auto& eq_literals) { return not eq_literals.empty(); }); // Return early if no column with equality predicate(s) - if (equality_col_schemas.empty()) { return std::nullopt; } + if (equality_col_schemas.empty()) { return {std::nullopt, false}; } // Required alignment: // https://github.com/NVIDIA/cuCollections/blob/deab5799f3e4226cb8a49acf2199c03b14941ee4/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh#L55-L67 @@ -654,8 +645,8 @@ std::optional>> aggregate_reader_metadata::ap auto bloom_filter_data = read_bloom_filters( sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream, aligned_mr); - // No bloom filter buffers, return the original row group indices - if (bloom_filter_data.empty()) { return std::nullopt; } + // No bloom filter buffers, return early + if (bloom_filter_data.empty()) { return {std::nullopt, false}; } // Get parquet types for the predicate columns auto const parquet_types = get_parquet_types(input_row_group_indices, equality_col_schemas); @@ -676,8 +667,10 @@ std::optional>> aggregate_reader_metadata::ap h_bloom_filter_spans, stream, cudf::get_current_device_resource_ref()); // Create a bloom filter query table caster - bloom_filter_caster const bloom_filter_col{ - bloom_filter_spans, parquet_types, total_row_groups, equality_col_schemas.size()}; + bloom_filter_caster const bloom_filter_col{bloom_filter_spans, + parquet_types, + static_cast(total_row_groups), + equality_col_schemas.size()}; // Converts bloom filter membership for equality predicate columns to a table // containing a column for each `col[i] == literal` predicate to be evaluated. @@ -714,10 +707,11 @@ std::optional>> aggregate_reader_metadata::ap // Filter bloom filter membership table with the BloomfilterAST expression and collect // filtered row group indices - return collect_filtered_row_group_indices(bloom_filter_membership_table, - bloom_filter_expr.get_bloom_filter_expr(), - input_row_group_indices, - stream); + return {collect_filtered_row_group_indices(bloom_filter_membership_table, + bloom_filter_expr.get_bloom_filter_expr(), + input_row_group_indices, + stream), + true}; } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 0e307bac097..1508b7eef8b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -388,40 +388,17 @@ class stats_expression_converter : public ast::detail::expression_transformer { }; } // namespace -std::optional>> aggregate_reader_metadata::filter_row_groups( +std::pair>>, surviving_row_group_metrics> +aggregate_reader_metadata::filter_row_groups( host_span const> sources, - host_span const> row_group_indices, + host_span const> input_row_group_indices, + size_type total_row_groups, host_span output_dtypes, host_span output_column_schemas, std::reference_wrapper filter, rmm::cuda_stream_view stream) const { auto mr = cudf::get_current_device_resource_ref(); - // Create row group indices. 
- std::vector> all_row_group_indices; - host_span const> input_row_group_indices; - if (row_group_indices.empty()) { - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(all_row_group_indices), - [](auto const& file_meta) { - std::vector rg_idx(file_meta.row_groups.size()); - std::iota(rg_idx.begin(), rg_idx.end(), 0); - return rg_idx; - }); - input_row_group_indices = host_span const>(all_row_group_indices); - } else { - input_row_group_indices = row_group_indices; - } - auto const total_row_groups = std::accumulate( - input_row_group_indices.begin(), - input_row_group_indices.end(), - size_t{0}, - [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); - - // Check if we have less than 2B total row groups. - CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), - "Total number of row groups exceed the size_type's limit"); // Converts Column chunk statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] @@ -451,16 +428,22 @@ std::optional>> aggregate_reader_metadata::fi // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. stats_expression_converter const stats_expr{filter.get(), static_cast(output_dtypes.size())}; - auto stats_ast = stats_expr.get_stats_expr(); - auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr); - auto predicate = predicate_col->view(); - CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, - "Filter expression must return a boolean column"); // Filter stats table with StatsAST expression and collect filtered row group indices auto const filtered_row_group_indices = collect_filtered_row_group_indices( stats_table, stats_expr.get_stats_expr(), input_row_group_indices, stream); + // Number of surviving row groups after applying stats filter + auto const num_stats_filtered_row_groups = + filtered_row_group_indices.has_value() + ? std::accumulate(filtered_row_group_indices.value().cbegin(), + filtered_row_group_indices.value().cend(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { + return sum + per_file_row_groups.size(); + }) + : total_row_groups; + // Span of row groups to apply bloom filtering on. auto const bloom_filter_input_row_groups = filtered_row_group_indices.has_value() @@ -468,12 +451,32 @@ std::optional>> aggregate_reader_metadata::fi : input_row_group_indices; // Apply bloom filtering on the bloom filter input row groups - auto const bloom_filtered_row_groups = apply_bloom_filters( - sources, bloom_filter_input_row_groups, output_dtypes, output_column_schemas, filter, stream); + auto const [bloom_filtered_row_groups, bloom_filters_exist] = + apply_bloom_filters(sources, + bloom_filter_input_row_groups, + num_stats_filtered_row_groups, + output_dtypes, + output_column_schemas, + filter, + stream); + + // Number of surviving row groups after applying bloom filter + auto const num_bloom_filtered_row_groups = + bloom_filters_exist + ? (bloom_filtered_row_groups.has_value() + ? std::make_optional(std::accumulate(bloom_filtered_row_groups.value().cbegin(), + bloom_filtered_row_groups.value().cend(), + size_type{0}, + [](auto& sum, auto const& per_file_row_groups) { + return sum + per_file_row_groups.size(); + })) + : std::make_optional(num_stats_filtered_row_groups)) + : std::nullopt; // Return bloom filtered row group indices iff collected - return bloom_filtered_row_groups.has_value() ? 
bloom_filtered_row_groups - : filtered_row_group_indices; + return { + bloom_filtered_row_groups.has_value() ? bloom_filtered_row_groups : filtered_row_group_indices, + {std::make_optional(num_stats_filtered_row_groups), num_bloom_filtered_row_groups}}; } // convert column named expression to column index reference expression diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 9dd4e19de52..87e358e89f8 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -610,6 +610,17 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); + // Copy number of total input row groups and number of surviving row groups from predicate + // pushdown. + out_metadata.num_input_row_groups = _file_itm_data.num_input_row_groups; + // Copy the number surviving row groups from each predicate pushdown only if the filter has value. + if (_expr_conv.get_converted_expr().has_value()) { + out_metadata.num_row_groups_after_stats_filter = + _file_itm_data.surviving_row_groups.after_stats_filter; + out_metadata.num_row_groups_after_bloom_filter = + _file_itm_data.surviving_row_groups.after_bloom_filter; + } + // no work to do (this can happen on the first pass if we have no rows to read) if (!has_more_work()) { // Check if number of rows per source should be included in output metadata. diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 4a773fbced1..294eaf9ac16 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -47,6 +47,11 @@ struct file_intermediate_data { // partial sum of the number of rows per data source std::vector exclusive_sum_num_rows_per_source{}; + size_type num_input_row_groups{0}; // total number of input row groups across all data sources + + // struct containing the number of remaining row groups after each predicate pushdown filter + surviving_row_group_metrics surviving_row_groups; + size_t _current_input_pass{0}; // current input pass index size_t _output_chunk_count{0}; // how many output chunks we have produced diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 7d3b6a39d5b..768ca384352 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -408,10 +408,16 @@ int64_t aggregate_reader_metadata::calc_num_rows() const size_type aggregate_reader_metadata::calc_num_row_groups() const { - return std::accumulate( - per_file_metadata.cbegin(), per_file_metadata.cend(), 0, [](auto& sum, auto& pfm) { + auto const total_row_groups = std::accumulate( + per_file_metadata.cbegin(), per_file_metadata.cend(), size_t{0}, [](size_t& sum, auto& pfm) { return sum + pfm.row_groups.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the size_type's limit"); + + return static_cast(total_row_groups); } // Copies info from the column and offset indexes into the passed in row_group_info. 
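calc_num_row_groups above shows the pattern this change applies to every row-group total: accumulate in size_t, validate against the size_type limit (cudf::size_type is a 32-bit int), then narrow once. The same check in isolation:

#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <vector>

// Accumulate in a wide type, check the bound, then narrow exactly once.
int32_t checked_total(std::vector<std::size_t> const& counts)
{
  auto const total = std::accumulate(counts.begin(), counts.end(), std::size_t{0});
  if (total > static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
    throw std::overflow_error("total number of row groups exceeds the size_type limit");
  }
  return static_cast<int32_t>(total);
}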
@@ -1029,7 +1035,12 @@ std::vector<std::string> aggregate_reader_metadata::get_pandas_index_names() con
   return names;
 }
 
-std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
+std::tuple<int64_t,
+           size_type,
+           std::vector<row_group_info>,
+           std::vector<size_t>,
+           size_type,
+           surviving_row_group_metrics>
 aggregate_reader_metadata::select_row_groups(
   host_span<std::unique_ptr<datasource> const> sources,
   host_span<std::vector<size_type> const> row_group_indices,
@@ -1040,17 +1051,63 @@ aggregate_reader_metadata::select_row_groups(
   std::optional<std::reference_wrapper<ast::expression const>> filter,
   rmm::cuda_stream_view stream) const
 {
+  // Compute total number of input row groups
+  size_type total_row_groups = [&]() {
+    if (not row_group_indices.empty()) {
+      size_t const total_row_groups =
+        std::accumulate(row_group_indices.begin(),
+                        row_group_indices.end(),
+                        size_t{0},
+                        [](size_t& sum, auto const& pfm) { return sum + pfm.size(); });
+
+      // Check if we have less than 2B total row groups.
+      CUDF_EXPECTS(total_row_groups <= std::numeric_limits<size_type>::max(),
+                   "Total number of row groups exceed the size_type's limit");
+      return static_cast<size_type>(total_row_groups);
+    } else {
+      return num_row_groups;
+    }
+  }();
+
+  // Pair to store the number of row groups after stats and bloom filtering respectively. Initialize
+  // to total_row_groups.
+  surviving_row_group_metrics num_row_groups_after_filters{};
+
   std::optional<std::vector<std::vector<size_type>>> filtered_row_group_indices;
   // if filter is not empty, then gather row groups to read after predicate pushdown
   if (filter.has_value()) {
-    filtered_row_group_indices = filter_row_groups(
-      sources, row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
+    // Span of input row group indices for predicate pushdown
+    host_span<std::vector<size_type> const> input_row_group_indices;
+    std::vector<std::vector<size_type>> all_row_group_indices;
+    if (row_group_indices.empty()) {
+      std::transform(per_file_metadata.cbegin(),
+                     per_file_metadata.cend(),
+                     std::back_inserter(all_row_group_indices),
+                     [](auto const& file_meta) {
+                       std::vector<size_type> rg_idx(file_meta.row_groups.size());
+                       std::iota(rg_idx.begin(), rg_idx.end(), 0);
+                       return rg_idx;
+                     });
+      input_row_group_indices = host_span<std::vector<size_type> const>(all_row_group_indices);
+    } else {
+      input_row_group_indices = row_group_indices;
+    }
+    // Predicate pushdown: Filter row groups using stats and bloom filters
+    std::tie(filtered_row_group_indices, num_row_groups_after_filters) =
+      filter_row_groups(sources,
+                        input_row_group_indices,
+                        total_row_groups,
+                        output_dtypes,
+                        output_column_schemas,
+                        filter.value(),
+                        stream);
     if (filtered_row_group_indices.has_value()) {
       row_group_indices =
         host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
     }
   }
-  std::vector<row_group_info> selection;
+
+  // Compute the number of rows to read and skip
   auto [rows_to_skip, rows_to_read] = [&]() {
     if (not row_group_indices.empty()) { return std::pair<int64_t, size_type>{}; }
     auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options(
@@ -1061,7 +1118,9 @@ aggregate_reader_metadata::select_row_groups(
                      static_cast<size_type>(from_opts.second)};
   }();
 
-  // Get number of rows in each data source
+  // Vector to hold the `row_group_info` of selected row groups
+  std::vector<row_group_info> selection;
+  // Number of rows in each data source
   std::vector<size_t> num_rows_per_source(per_file_metadata.size(), 0);
 
   if (!row_group_indices.empty()) {
@@ -1083,6 +1142,10 @@ aggregate_reader_metadata::select_row_groups(
       }
     }
   } else {
+    // Reset and recompute input row group count to adjust for num_rows and skip_rows. Here, the
+    // output from predicate pushdown was empty. i.e., no row groups filtered.
+    total_row_groups = 0;
+
     size_type count = 0;
     for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) {
       auto const& fmd = per_file_metadata[src_idx];
@@ -1093,6 +1156,9 @@ aggregate_reader_metadata::select_row_groups(
         auto const chunk_start_row = count;
         count += rg.num_rows;
         if (count > rows_to_skip || count == 0) {
+          // Keep this row group, increase count
+          total_row_groups++;
+
           // start row of this row group adjusted with rows_to_skip
           num_rows_per_source[src_idx] += count;
           num_rows_per_source[src_idx] -=
@@ -1113,9 +1179,24 @@ aggregate_reader_metadata::select_row_groups(
         }
       }
     }
+
+    // If filter had a value and no row groups were filtered, set the number of row groups after
+    // filters to the number of adjusted input row groups
+    auto const after_stats_filter = num_row_groups_after_filters.after_stats_filter.has_value()
+                                      ? std::make_optional(total_row_groups)
+                                      : std::nullopt;
+    auto const after_bloom_filter = num_row_groups_after_filters.after_bloom_filter.has_value()
+                                      ? std::make_optional(total_row_groups)
+                                      : std::nullopt;
+    num_row_groups_after_filters = {after_stats_filter, after_bloom_filter};
   }
 
-  return {rows_to_skip, rows_to_read, std::move(selection), std::move(num_rows_per_source)};
+  return {rows_to_skip,
+          rows_to_read,
+          std::move(selection),
+          std::move(num_rows_per_source),
+          total_row_groups,
+          std::move(num_row_groups_after_filters)};
 }
 
 std::tuple<std::vector<input_column_info>,
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index ba5e53e3104..c4372b2c1ff 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -125,6 +125,14 @@ struct arrow_schema_data_types {
   data_type type{type_id::EMPTY};
 };
 
+/**
+ * @brief Struct to store the number of row groups surviving each predicate pushdown filter.
+ */
+struct surviving_row_group_metrics {
+  std::optional<size_type> after_stats_filter;  // number of surviving row groups after stats filter
+  std::optional<size_type> after_bloom_filter;  // number of surviving row groups after bloom filter
+};
+
 class aggregate_reader_metadata {
   std::vector<metadata> per_file_metadata;
   std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
@@ -358,40 +366,47 @@ class aggregate_reader_metadata {
    * @brief Filters the row groups based on predicate filter
    *
    * @param sources Lists of input datasources
-   * @param row_group_indices Lists of row groups to read, one per source
+   * @param input_row_group_indices Lists of input row groups, one per source
+   * @param total_row_groups Total number of row groups in `input_row_group_indices`
    * @param output_dtypes Datatypes of output columns
    * @param output_column_schemas schema indices of output columns
    * @param filter AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return Filtered row group indices, if any is filtered
+   * @return A pair of a list of filtered row group indices if any are filtered, and a struct
+   *         containing the number of row groups surviving each predicate pushdown filter
    */
-  [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> filter_row_groups(
-    host_span<std::unique_ptr<datasource> const> sources,
-    host_span<std::vector<size_type> const> row_group_indices,
-    host_span<data_type const> output_dtypes,
-    host_span<int const> output_column_schemas,
-    std::reference_wrapper<ast::expression const> filter,
-    rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::pair<std::optional<std::vector<std::vector<size_type>>>,
+                          surviving_row_group_metrics>
+  filter_row_groups(host_span<std::unique_ptr<datasource> const> sources,
+                    host_span<std::vector<size_type> const> input_row_group_indices,
+                    size_type total_row_groups,
+                    host_span<data_type const> output_dtypes,
+                    host_span<int const> output_column_schemas,
+                    std::reference_wrapper<ast::expression const> filter,
+                    rmm::cuda_stream_view stream) const;
 
   /**
    * @brief Filters the row groups using bloom filters
    *
    * @param sources Dataset sources
-   * @param row_group_indices Lists of input row groups to read, one per source
+   * @param input_row_group_indices Lists of input row groups, one per source
+   * @param total_row_groups Total number of row groups in `input_row_group_indices`
    * @param output_dtypes Datatypes of output columns
    * @param output_column_schemas schema indices of output columns
    * @param filter AST expression to filter row groups based on bloom filter membership
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Filtered row group indices, if any is filtered
+   * @return A pair of filtered row group indices if any is filtered, and a boolean indicating if
+   *         bloom filtering was applied
    */
-  [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> apply_bloom_filters(
-    host_span<std::unique_ptr<datasource> const> sources,
-    host_span<std::vector<size_type> const> input_row_group_indices,
-    host_span<data_type const> output_dtypes,
-    host_span<int const> output_column_schemas,
-    std::reference_wrapper<ast::expression const> filter,
-    rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::pair<std::optional<std::vector<std::vector<size_type>>>, bool>
+  apply_bloom_filters(host_span<std::unique_ptr<datasource> const> sources,
+                      host_span<std::vector<size_type> const> input_row_group_indices,
+                      size_type total_row_groups,
+                      host_span<data_type const> output_dtypes,
+                      host_span<int const> output_column_schemas,
+                      std::reference_wrapper<ast::expression const> filter,
+                      rmm::cuda_stream_view stream) const;
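To make the optional semantics concrete, a standalone sketch (not part of the patch; `size_type` mirrors `cudf::size_type`, which is `int32_t`). A disengaged optional means that filter stage never ran, so each stage falls back to the previous surviving count:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

using size_type = int32_t;  // mirrors cudf::size_type

struct surviving_row_group_metrics {
  std::optional<size_type> after_stats_filter;
  std::optional<size_type> after_bloom_filter;
};

void print_pruning_summary(size_type num_input_row_groups, surviving_row_group_metrics const& m)
{
  // A missing value means the stage was skipped (no filter expression, or no
  // bloom filters present in the file), so the prior count carries through.
  auto const after_stats = m.after_stats_filter.value_or(num_input_row_groups);
  auto const after_bloom = m.after_bloom_filter.value_or(after_stats);
  std::cout << num_input_row_groups << " -> " << after_stats << " (stats) -> " << after_bloom
            << " (bloom)\n";
}

int main()
{
  // 8 input row groups, stats pruned down to 3, bloom filters absent.
  print_pruning_summary(8, {std::make_optional<size_type>(3), std::nullopt});
  return 0;
}
```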
  /**
   * @brief Filters and reduces down to a selection of row groups
   *
@@ -408,9 +423,15 @@ class aggregate_reader_metadata {
   * @param filter Optional AST expression to filter row groups based on Column chunk statistics
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @return A tuple of corrected row_start, row_count, list of row group indexes and its
-   *         starting row, and list of number of rows per source
+   *         starting row, list of number of rows per source, number of input row groups, and a
+   *         struct containing the number of row groups surviving each predicate pushdown filter
   */
-  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
+  [[nodiscard]] std::tuple<int64_t,
+                           size_type,
+                           std::vector<row_group_info>,
+                           std::vector<size_t>,
+                           size_type,
+                           surviving_row_group_metrics>
   select_row_groups(host_span<std::unique_ptr<datasource> const> sources,
                     host_span<std::vector<size_type> const> row_group_indices,
                     int64_t row_start,
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 3874346e471..b6134947b0c 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1285,7 +1285,9 @@ void reader::impl::preprocess_file(read_mode mode)
   std::tie(_file_itm_data.global_skip_rows,
            _file_itm_data.global_num_rows,
            _file_itm_data.row_groups,
-           _file_itm_data.num_rows_per_source) =
+           _file_itm_data.num_rows_per_source,
+           _file_itm_data.num_input_row_groups,
+           _file_itm_data.surviving_row_groups) =
     _metadata->select_row_groups(_sources,
                                  _options.row_group_indices,
                                  _options.skip_rows,
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 1d506c59cd9..4e96f900bf3 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -234,6 +234,16 @@ struct DeviceNot {
   }
 };
 
+// negation
+
+struct DeviceNegate {
+  template <typename T>
+  T __device__ operator()(T data)
+  {
+    return -data;
+  }
+};
+
 // fixed_point ops
 
 /*
@@ -278,6 +288,12 @@ struct fixed_point_abs {
   __device__ T operator()(T data) { return numeric::detail::abs(data); }
 };
 
+template <typename T>
+struct fixed_point_negate {
+  T n;
+  __device__ T operator()(T data) { return -data; }
+};
+
 template <typename T, template <typename> typename FixedPointFunctor>
 std::unique_ptr<column> unary_op_with(column_view const& input,
                                       rmm::cuda_stream_view stream,
@@ -414,6 +430,34 @@ struct MathOpDispatcher {
   }
 };
 
+template <typename UFN>
+struct NegateOpDispatcher {
+  template <typename T>
+  static constexpr bool is_supported()
+  {
+    return std::is_signed_v<T> || cudf::is_duration<T>();
+  }
+
+  template <typename T, std::enable_if_t<is_supported<T>()>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+  {
+    return transform_fn<T, UFN>(input.begin<T>(),
+                                input.end<T>(),
+                                cudf::detail::copy_bitmask(input, stream, mr),
+                                input.null_count(),
+                                stream,
+                                mr);
+  }
+
+  template <typename T, typename... Args>
+  std::enable_if_t<!is_supported<T>(), std::unique_ptr<cudf::column>> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported data type for negate operation");
+  }
+};
+
 template <typename UFN>
 struct BitwiseOpDispatcher {
   template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
@@ -550,9 +594,10 @@ struct FixedPointOpDispatcher {
   {
     // clang-format off
     switch (op) {
-      case cudf::unary_operator::CEIL:   return unary_op_with<T, fixed_point_ceil>(input, stream, mr);
-      case cudf::unary_operator::FLOOR:  return unary_op_with<T, fixed_point_floor>(input, stream, mr);
-      case cudf::unary_operator::ABS:    return unary_op_with<T, fixed_point_abs>(input, stream, mr);
+      case cudf::unary_operator::CEIL:   return unary_op_with<T, fixed_point_ceil>(input, stream, mr);
+      case cudf::unary_operator::FLOOR:  return unary_op_with<T, fixed_point_floor>(input, stream, mr);
+      case cudf::unary_operator::ABS:    return unary_op_with<T, fixed_point_abs>(input, stream, mr);
+      case cudf::unary_operator::NEGATE: return unary_op_with<T, fixed_point_negate>(input, stream, mr);
       default: CUDF_FAIL("Unsupported fixed_point unary operation");
     }
     // clang-format on
@@ -639,6 +684,9 @@ std::unique_ptr<cudf::column> unary_operation(cudf::column_view const& input,
     case cudf::unary_operator::NOT:
       return cudf::type_dispatcher(
         input.type(), detail::LogicalOpDispatcher<DeviceNot>{}, input, stream, mr);
+    case cudf::unary_operator::NEGATE:
+      return cudf::type_dispatcher(
+        input.type(), detail::NegateOpDispatcher<DeviceNegate>{}, input, stream, mr);
     default: CUDF_FAIL("Undefined unary operation");
   }
 }
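A quick usage sketch for the new operator (an illustration, not part of the patch): NEGATE dispatches through `NegateOpDispatcher` for signed numeric and duration columns, through `FixedPointOpDispatcher` for decimals, and throws for everything else:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/unary.hpp>

#include <memory>

// Elementwise `-x`; the input's null mask is copied to the result.
std::unique_ptr<cudf::column> negate(cudf::column_view const& input)
{
  // Throws cudf::logic_error for unsigned integer, timestamp, string, ... inputs.
  return cudf::unary_operation(input, cudf::unary_operator::NEGATE);
}
```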
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 177e6163d4f..b96c423917a 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1328,6 +1328,26 @@ TEST_F(ParquetReaderTest, ReorderedReadMultipleFiles)
   CUDF_TEST_EXPECT_TABLES_EQUAL(sliced[1], swapped2);
 }
 
+TEST_F(ParquetReaderTest, NoFilter)
+{
+  srand(31337);
+  auto expected = create_random_fixed_table<int>(9, 9, false);
+
+  auto filepath = temp_env->get_temp_filepath("FilterSimple.parquet");
+  cudf::io::parquet_writer_options args =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected);
+  cudf::io::write_parquet(args);
+
+  cudf::io::parquet_reader_options read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected);
+  EXPECT_EQ(result.metadata.num_input_row_groups, 1);
+  EXPECT_FALSE(result.metadata.num_row_groups_after_stats_filter.has_value());
+  EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value());
+}
+
 TEST_F(ParquetReaderTest, FilterSimple)
 {
   srand(31337);
@@ -2681,52 +2701,107 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped)
   auto const [src, filepath] = create_parquet_typed_with_stats<TypeParam>("FilterTyped.parquet");
   auto const written_table   = src.view();
 
+  auto const col_name_0 = cudf::ast::column_name_reference("col0");
+  auto const col_ref_0  = cudf::ast::column_reference(0);
 
-  // Filtering AST
-  auto literal_value = []() {
-    if constexpr (cudf::is_timestamp<T>()) {
-      // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds
-      return cudf::timestamp_scalar<T>(T(typename T::duration(10000)));  // i (0-20,000)
-    } else if constexpr (cudf::is_duration<T>()) {
-      // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds
-      return cudf::duration_scalar<T>(T(10000));  // i (0-20,000)
-    } else if constexpr (std::is_same_v<T, cudf::string_view>) {
-      // table[0] < "000010000"
-      return cudf::string_scalar("000010000");  // i (0-20,000)
+  auto const test_predicate_pushdown = [&](cudf::ast::operation const& filter_expression,
+                                           cudf::ast::operation const& ref_filter,
+                                           cudf::size_type expected_total_row_groups,
+                                           cudf::size_type expected_stats_filtered_row_groups) {
+    // Expected result
+    auto const predicate = cudf::compute_column(written_table, ref_filter);
+    EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8)
+      << "Predicate filter should return a boolean";
+    auto const expected = cudf::apply_boolean_mask(written_table, *predicate);
+
+    // Reading with Predicate Pushdown
+    cudf::io::parquet_reader_options read_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+        .filter(filter_expression);
+    auto const result       = cudf::io::read_parquet(read_opts);
+    auto const result_table = result.tbl->view();
+
+    // Tests
+    EXPECT_EQ(static_cast<int>(written_table.column(0).type().id()),
+              static_cast<int>(result_table.column(0).type().id()))
+      << "col0 type mismatch";
+
+    // To make sure AST filters out some elements if row groups must be filtered
+    if (expected_stats_filtered_row_groups < expected_total_row_groups) {
+      EXPECT_LT(expected->num_rows(), written_table.num_rows());
     } else {
-      // table[0] < 0 or 100u
-      return cudf::numeric_scalar<T>((100 - 100 * std::is_signed_v<T>));  // i/100 (-100-100/ 0-200)
+      EXPECT_LE(expected->num_rows(), written_table.num_rows());
     }
-  }();
-  auto literal           = cudf::ast::literal(literal_value);
-  auto col_name_0        = cudf::ast::column_name_reference("col0");
-  auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal);
-  auto col_ref_0         = cudf::ast::column_reference(0);
-  auto ref_filter        = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal);
-
-  // Expected result
-  auto predicate = cudf::compute_column(written_table, ref_filter);
-  EXPECT_EQ(predicate->view().type().id(), cudf::type_id::BOOL8)
-    << "Predicate filter should return a boolean";
-  auto expected = cudf::apply_boolean_mask(written_table, *predicate);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table);
+    EXPECT_EQ(result.metadata.num_input_row_groups, expected_total_row_groups);
+    EXPECT_TRUE(result.metadata.num_row_groups_after_stats_filter.has_value());
+    EXPECT_EQ(result.metadata.num_row_groups_after_stats_filter.value(),
+              expected_stats_filtered_row_groups);
+    EXPECT_FALSE(result.metadata.num_row_groups_after_bloom_filter.has_value());
+  };
 
-  // Reading with Predicate Pushdown
-  cudf::io::parquet_reader_options read_opts =
-    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
-      .filter(filter_expression);
-  auto result       = cudf::io::read_parquet(read_opts);
-  auto result_table = result.tbl->view();
+  // The `literal_value` and stats should filter out 2 out of 4 row groups.
+  {
+    auto constexpr expected_total_row_groups          = 4;
+    auto constexpr expected_stats_filtered_row_groups = 2;
+
+    // Filtering AST
+    auto literal_value = []() {
+      if constexpr (cudf::is_timestamp<T>()) {
+        // table[0] < 10000 timestamp days/seconds/milliseconds/microseconds/nanoseconds
+        return cudf::timestamp_scalar<T>(T(typename T::duration(10000)));  // i (0-20,000)
+      } else if constexpr (cudf::is_duration<T>()) {
+        // table[0] < 10000 day/seconds/milliseconds/microseconds/nanoseconds
+        return cudf::duration_scalar<T>(T(10000));  // i (0-20,000)
+      } else if constexpr (std::is_same_v<T, cudf::string_view>) {
+        // table[0] < "000010000"
+        return cudf::string_scalar("000010000");  // i (0-20,000)
+      } else {
+        // table[0] < 0 or 100u
+        return cudf::numeric_scalar<T>(
+          (100 - 100 * std::is_signed_v<T>));  // i/100 (-100-100/ 0-200)
+      }
+    }();
+
+    auto const literal = cudf::ast::literal(literal_value);
+    auto const filter_expression =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS, col_name_0, literal);
+    auto const ref_filter = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal);
+    test_predicate_pushdown(
+      filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups);
+  }
 
-  // tests
-  EXPECT_EQ(int(written_table.column(0).type().id()), int(result_table.column(0).type().id()))
-    << "col0 type mismatch";
-  // To make sure AST filters out some elements
-  EXPECT_LT(expected->num_rows(), written_table.num_rows());
-  EXPECT_EQ(result_table.num_rows(), expected->num_rows());
-  EXPECT_EQ(result_table.num_columns(), expected->num_columns());
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table);
+  // The `literal_value` and stats should not filter any of the 4 row groups.
+  {
+    auto constexpr expected_total_row_groups          = 4;
+    auto constexpr expected_stats_filtered_row_groups = 4;
+
+    // Filtering AST
+    auto literal_value = []() {
+      if constexpr (cudf::is_timestamp<T>()) {
+        return cudf::timestamp_scalar<T>(T(typename T::duration(20000)));
+      } else if constexpr (cudf::is_duration<T>()) {
+        return cudf::duration_scalar<T>(T(20000));
+      } else if constexpr (std::is_same_v<T, cudf::string_view>) {
+        return cudf::string_scalar("000020000");
+      } else {
+        return cudf::numeric_scalar<T>(std::numeric_limits<T>::max());
+      }
+    }();
+
+    auto const literal = cudf::ast::literal(literal_value);
+    auto const filter_expression =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_name_0, literal);
+    auto const ref_filter =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref_0, literal);
+    test_predicate_pushdown(
+      filter_expression, ref_filter, expected_total_row_groups, expected_stats_filtered_row_groups);
+  }
 }
 
+//////////////////////
+// wide tables tests
+
 // The test below requires several minutes to complete with memcheck, thus it is disabled by
 // default.
 TEST_F(ParquetReaderTest, DISABLED_ListsWideTable)
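These tests depend on the writer splitting `FilterTyped.parquet` into four row groups so that statistics can prune at row-group granularity. A hedged sketch of the knob involved (hypothetical table and sizing; `row_group_size_rows` is a pre-existing writer option, not something this patch adds):

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

void write_with_small_row_groups(cudf::table_view const& table, std::string const& path)
{
  auto const opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{path}, table)
      // e.g. 20'000 input rows -> 4 row groups of 5'000 rows each
      .row_group_size_rows(5000)
      .build();
  cudf::io::write_parquet(opts);
}
```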
@@ -25,6 +25,69 @@
 #include <cmath>
 
+using TypesToNegate = cudf::test::Types<int8_t,
+                                        int16_t,
+                                        int32_t,
+                                        int64_t,
+                                        float,
+                                        double,
+                                        cudf::duration_D,
+                                        cudf::duration_s,
+                                        cudf::duration_ms,
+                                        cudf::duration_us,
+                                        cudf::duration_ns>;
+
+template <typename T>
+struct UnaryNegateTests : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(UnaryNegateTests, TypesToNegate);
+
+TYPED_TEST(UnaryNegateTests, SimpleNEGATE)
+{
+  using T = TypeParam;
+  cudf::test::fixed_width_column_wrapper<T, int32_t> input{{0, 1, 2, 3}};
+  auto const v = cudf::test::make_type_param_vector<T>({0, -1, -2, -3});
+  cudf::test::fixed_width_column_wrapper<T> expected(v.begin(), v.end());
+  auto output = cudf::unary_operation(input, cudf::unary_operator::NEGATE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view());
+}
+
+using TypesNotToNegate = cudf::test::Types<uint8_t,
+                                           uint16_t,
+                                           uint32_t,
+                                           uint64_t,
+                                           cudf::timestamp_D,
+                                           cudf::timestamp_s,
+                                           cudf::timestamp_ms,
+                                           cudf::timestamp_us,
+                                           cudf::timestamp_ns>;
+
+template <typename T>
+struct UnaryNegateErrorTests : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(UnaryNegateErrorTests, TypesNotToNegate);
+
+TYPED_TEST(UnaryNegateErrorTests, UnsupportedTypesFail)
+{
+  using T = TypeParam;
+  cudf::test::fixed_width_column_wrapper<T, int32_t> input({1, 2, 3, 4});
+  EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error);
+}
+
+struct UnaryNegateComplexTypesErrorTests : public cudf::test::BaseFixture {};
+
+TEST_F(UnaryNegateComplexTypesErrorTests, NegateStringColumnFail)
+{
+  cudf::test::strings_column_wrapper input({"foo", "bar"});
+  EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error);
+}
+
+TEST_F(UnaryNegateComplexTypesErrorTests, NegateListsColumnFail)
+{
+  cudf::test::lists_column_wrapper<int32_t> input{{1, 2}, {3, 4}};
+  EXPECT_THROW(cudf::unary_operation(input, cudf::unary_operator::NEGATE), cudf::logic_error);
+}
+
 template <typename T>
 struct UnaryLogicalOpsTest : public cudf::test::BaseFixture {};
 
@@ -274,7 +337,7 @@ TYPED_TEST(UnaryMathFloatOpsTest, SimpleTANH)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view());
 }
 
-TYPED_TEST(UnaryMathFloatOpsTest, SimpleiASINH)
+TYPED_TEST(UnaryMathFloatOpsTest, SimpleASINH)
 {
   cudf::test::fixed_width_column_wrapper<TypeParam> input{{0.0}};
   cudf::test::fixed_width_column_wrapper<TypeParam> expected{{0.0}};
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index 3c616461c74..d7989c6b053 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -266,6 +266,20 @@ struct FixedPointUnaryTests : public cudf::test::BaseFixture {};
 
 TYPED_TEST_SUITE(FixedPointUnaryTests, cudf::test::FixedPointTypes);
 
+TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryNegate)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const input    = fp_wrapper{{0, -1234, -3456, -6789, 1234, 3456, 6789}, scale_type{-3}};
+  auto const expected = fp_wrapper{{0, 1234, 3456, 6789, -1234, -3456, -6789}, scale_type{-3}};
+  auto const result   = cudf::unary_operation(input, cudf::unary_operator::NEGATE);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryAbs)
 {
   using namespace numeric;
diff --git a/dependencies.yaml b/dependencies.yaml
index 501128d278e..b1378fae6d7 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -638,7 +638,7 @@ dependencies:
       - output_types: [conda]
        packages:
          - breathe>=4.35.0
-          - dask-cuda==25.2.*,>=0.0.0a0
+          - dask-cuda==25.4.*,>=0.0.0a0
          - *doxygen
          - make
          - myst-nb
@@ -786,13 +786,13 @@ dependencies:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - rapids-dask-dependency==25.2.*,>=0.0.0a0
+          - rapids-dask-dependency==25.4.*,>=0.0.0a0
  run_dask_cudf:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
          - pynvml>=12.0.0,<13.0.0a0
-          - rapids-dask-dependency==25.2.*,>=0.0.0a0
+          - rapids-dask-dependency==25.4.*,>=0.0.0a0
  run_custreamz:
    common:
      - output_types: conda
@@ -930,7 +930,7 @@ dependencies:
    common:
      - output_types: [conda, requirements, pyproject]
        packages:
-          - dask-cuda==25.2.*,>=0.0.0a0
+          - dask-cuda==25.4.*,>=0.0.0a0
          - *numba-cuda-dep
          - *numba-dep
    specific:
@@ -951,7 +951,7 @@ dependencies:
    common:
      - output_types: conda
        packages:
-          - &libcudf_unsuffixed libcudf==25.2.*,>=0.0.0a0
+          - &libcudf_unsuffixed libcudf==25.4.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
@@ -965,18 +965,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - libcudf-cu12==25.2.*,>=0.0.0a0
+              - libcudf-cu12==25.4.*,>=0.0.0a0
          - matrix:
              cuda: "11.*"
              cuda_suffixed: "true"
            packages:
-              - libcudf-cu11==25.2.*,>=0.0.0a0
+              - libcudf-cu11==25.4.*,>=0.0.0a0
          - {matrix: null, packages: [*libcudf_unsuffixed]}
  depends_on_pylibcudf:
    common:
      - output_types: conda
        packages:
-          - &pylibcudf_unsuffixed pylibcudf==25.2.*,>=0.0.0a0
+          - &pylibcudf_unsuffixed pylibcudf==25.4.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
@@ -990,18 +990,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - pylibcudf-cu12==25.2.*,>=0.0.0a0
+              - pylibcudf-cu12==25.4.*,>=0.0.0a0
          - matrix:
              cuda: "11.*"
              cuda_suffixed: "true"
            packages:
-              - pylibcudf-cu11==25.2.*,>=0.0.0a0
+              - pylibcudf-cu11==25.4.*,>=0.0.0a0
          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
  depends_on_cudf:
    common:
      - output_types: conda
        packages:
-          - &cudf_unsuffixed cudf==25.2.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==25.4.*,>=0.0.0a0
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
@@ -1015,18 +1015,18 @@ dependencies:
              cuda: "12.*"
              cuda_suffixed: "true"
            packages:
-              - cudf-cu12==25.2.*,>=0.0.0a0
+              - cudf-cu12==25.4.*,>=0.0.0a0
          - matrix:
              cuda: "11.*"
              cuda_suffixed: "true"
            packages:
-              - cudf-cu11==25.2.*,>=0.0.0a0
+              - cudf-cu11==25.4.*,>=0.0.0a0
          - {matrix: null, packages:
[*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_unsuffixed cudf_kafka==25.2.*,>=0.0.0a0 + - &cudf_kafka_unsuffixed cudf_kafka==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1040,12 +1040,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu12==25.2.*,>=0.0.0a0 + - cudf_kafka-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu11==25.2.*,>=0.0.0a0 + - cudf_kafka-cu11==25.4.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: @@ -1066,7 +1066,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.2.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.4.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.nvidia.com @@ -1078,12 +1078,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.2.*,>=0.0.0a0 + - libkvikio-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - libkvikio-cu11==25.2.*,>=0.0.0a0 + - libkvikio-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -1091,7 +1091,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==25.2.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1105,12 +1105,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==25.2.*,>=0.0.0a0 + - librmm-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==25.2.*,>=0.0.0a0 + - librmm-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -1118,7 +1118,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==25.2.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==25.4.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1132,12 +1132,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==25.2.*,>=0.0.0a0 + - rmm-cu12==25.4.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - rmm-cu11==25.2.*,>=0.0.0a0 + - rmm-cu11==25.4.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed diff --git a/java/ci/README.md b/java/ci/README.md index bfb35bc1d23..cc8ab77bf6c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.02 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.04 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-25.02.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-25.04.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml
index 8bbeac20c99..1f80381dd7e 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
   <groupId>ai.rapids</groupId>
   <artifactId>cudf</artifactId>
-  <version>25.02.0-SNAPSHOT</version>
+  <version>25.04.0-SNAPSHOT</version>
   <name>cudfjni</name>
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index d4ef09e44e8..e24cf72bab3 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -8,7 +8,6 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from numba.np import numpy_support
 from typing_extensions import Self
 
 import pylibcudf as plc
@@ -24,7 +23,6 @@
 from cudf.core.mixins import BinaryOperand
 from cudf.core.scalar import pa_scalar_to_plc_scalar
 from cudf.errors import MixedTypeError
-from cudf.utils import cudautils
 from cudf.utils.dtypes import (
     find_common_type,
     min_column_type,
@@ -33,7 +31,7 @@
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Sequence
+    from collections.abc import Sequence
 
     from cudf._typing import (
         ColumnBinaryOperand,
@@ -45,13 +43,6 @@
     from cudf.core.buffer import Buffer
     from cudf.core.column import DecimalBaseColumn
 
-_unaryop_map = {
-    "ASIN": "ARCSIN",
-    "ACOS": "ARCCOS",
-    "ATAN": "ARCTAN",
-    "INVERT": "BIT_INVERT",
-}
-
 
 class NumericalColumn(NumericalBaseColumn):
     """
@@ -197,24 +188,6 @@ def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase:
         )
         return type(self).from_pylibcudf(plc_column)
 
-    def unary_operator(self, unaryop: str | Callable) -> ColumnBase:
-        if callable(unaryop):
-            nb_type = numpy_support.from_dtype(self.dtype)
-            nb_signature = (nb_type,)
-            compiled_op = cudautils.compile_udf(unaryop, nb_signature)
-            np_dtype = np.dtype(compiled_op[1])
-            return self.transform(compiled_op, np_dtype)
-
-        unaryop = unaryop.upper()
-        unaryop = _unaryop_map.get(unaryop, unaryop)
-        unaryop = plc.unary.UnaryOperator[unaryop]
-        with acquire_spill_lock():
-            return type(self).from_pylibcudf(
-                plc.unary.unary_operation(
-                    self.to_pylibcudf(mode="read"), unaryop
-                )
-            )
-
     def __invert__(self):
         if self.dtype.kind in "ui":
             return self.unary_operator("invert")
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index d8c316a4c8f..2674b92bb21 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -6,6 +6,7 @@
 from typing import TYPE_CHECKING, Literal, cast
 
 import numpy as np
+from numba.np import numpy_support
 
 import pylibcudf as plc
 
@@ -14,12 +15,23 @@
 from cudf.core.column.column import ColumnBase
 from cudf.core.missing import NA
 from cudf.core.mixins import Scannable
+from cudf.utils import cudautils
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from cudf._typing import ScalarLike
     from cudf.core.column.decimal import DecimalDtype
 
+_unaryop_map = {
+    "ASIN": "ARCSIN",
+    "ACOS": "ARCCOS",
+    "ATAN": "ARCTAN",
+    "INVERT": "BIT_INVERT",
+}
+
+
 class NumericalBaseColumn(ColumnBase, Scannable):
     """
     A column composed of numerical (bool, integer, float, decimal) data.
@@ -268,3 +280,26 @@ def _scan(self, op: str) -> ColumnBase: return self.scan(op.replace("cum", ""), True)._with_type_metadata( self.dtype ) + + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: + if callable(unaryop): + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) + + unaryop = unaryop.upper() + unaryop = _unaryop_map.get(unaryop, unaryop) + unaryop = plc.unary.UnaryOperator[unaryop] + with acquire_spill_lock(): + return type(self).from_pylibcudf( + plc.unary.unary_operation( + self.to_pylibcudf(mode="read"), unaryop + ) + ) + + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + raise NotImplementedError( + "transform is not implemented for NumericalBaseColumn" + ) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 3f5aea19307..c7243d01325 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -41,6 +41,16 @@ } +@functools.cache +def get_np_td_unit_conversion( + reso: str, dtype: None | np.dtype +) -> np.timedelta64: + td = np.timedelta64(_unit_to_nanoseconds_conversion[reso], "ns") + if dtype is not None: + return td.astype(dtype) + return td + + class TimeDeltaColumn(ColumnBase): """ Parameters @@ -483,74 +493,33 @@ def components(self) -> dict[str, ColumnBase]: 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 """ - date_meta = { + "hours": ["D", "h"], + "minutes": ["h", "m"], "seconds": ["m", "s"], "milliseconds": ["s", "ms"], "microseconds": ["ms", "us"], "nanoseconds": ["us", "ns"], } - data = { - "days": self - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ), - "hours": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ), - "minutes": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["m"], "ns" - ).astype(self.dtype) - ), - } - keys_list = iter(date_meta.keys()) - for name in keys_list: - value = date_meta[name] - data[name] = ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[0]], "ns" - ).astype(self.dtype) + data = {"days": self // get_np_td_unit_conversion("D", self.dtype)} + reached_self_unit = False + for result_key, (mod_unit, div_unit) in date_meta.items(): + if not reached_self_unit: + res_col = ( + self % get_np_td_unit_conversion(mod_unit, self.dtype) + ) // get_np_td_unit_conversion(div_unit, self.dtype) + reached_self_unit = self.time_unit == div_unit + else: + res_col = column.as_column( + 0, length=len(self), dtype=np.dtype(np.int64) ) - ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[1]], "ns" - ).astype(self.dtype) - ) - if self.time_unit == value[1]: - break - - for name in keys_list: - res_col = column.as_column(0, length=len(self), dtype="int64") - if self.nullable: - res_col = res_col.set_mask(self.mask) - data[name] = res_col + if self.nullable: + res_col = res_col.set_mask(self.mask) + data[result_key] = res_col return data @property - def days(self) -> "cudf.core.column.NumericalColumn": + def days(self) -> 
cudf.core.column.NumericalColumn: """ Number of days for each element. @@ -558,14 +527,10 @@ def days(self) -> "cudf.core.column.NumericalColumn": ------- NumericalColumn """ - return self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( - self.dtype - ) - ) + return self // get_np_td_unit_conversion("D", self.dtype) @property - def seconds(self) -> "cudf.core.column.NumericalColumn": + def seconds(self) -> cudf.core.column.NumericalColumn: """ Number of seconds (>= 0 and less than 1 day). @@ -579,18 +544,11 @@ def seconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of seconds. return ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ) + self % get_np_td_unit_conversion("D", self.dtype) + ) // get_np_td_unit_conversion("s", None) @property - def microseconds(self) -> "cudf.core.column.NumericalColumn": + def microseconds(self) -> cudf.core.column.NumericalColumn: """ Number of microseconds (>= 0 and less than 1 second). @@ -604,16 +562,11 @@ def microseconds(self) -> "cudf.core.column.NumericalColumn": # division operation to extract the number of microseconds. return ( - self - % np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ).astype(self.dtype) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) + self % get_np_td_unit_conversion("s", self.dtype) + ) // get_np_td_unit_conversion("us", None) @property - def nanoseconds(self) -> "cudf.core.column.NumericalColumn": + def nanoseconds(self) -> cudf.core.column.NumericalColumn: """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -633,13 +586,8 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": res_col = res_col.set_mask(self.mask) return cast("cudf.core.column.NumericalColumn", res_col) return ( - self - % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ) + self % get_np_td_unit_conversion("us", None) + ) // get_np_td_unit_conversion("ns", None) def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 08f8e49a98c..fcf5a3cd8e9 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1644,7 +1644,7 @@ def __neg__(self): ( col.unary_operator("not") if col.dtype.kind == "b" - else -1 * col + else col.unary_operator("negate") for col in self._columns ) ) diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index d5e669cb58f..1fc53bbbaae 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -149,6 +149,7 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs): # Special wrapping to handle scalar values "_fsproxy_wrap": classmethod(wrap_ndarray), "base": _FastSlowAttribute("base", private=True), + "data": _FastSlowAttribute("data", private=True), }, ) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index bbd01eaa311..7ed0d370822 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -3,6 +3,7 @@ import itertools import operator import re +from decimal import Decimal import numpy as np import pandas as pd @@ -134,3 +135,9 @@ def test_series_bool_neg(): sr = Series([True, False, True, None, False, None, True, True]) psr = sr.to_pandas(nullable=True) assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True) + + +def test_series_decimal_neg(): + sr = Series([Decimal("0.0"), Decimal("1.23"), Decimal("4.567")]) + psr = sr.to_pandas() + assert_eq((-sr).to_pandas(), -psr, check_dtype=True) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 938d22de076..3e8b6d5786c 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1927,3 +1927,12 @@ def test_series_dtype_property(): expected = np.dtype(s) actual = np.dtype(xs) assert expected == actual + + +def test_numpy_data_access(): + s = pd.Series([1, 2, 3]) + xs = xpd.Series([1, 2, 3]) + expected = s.values.data + actual = xs.values.data + + assert type(expected) is type(actual) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 03068d2268a..977d25184b5 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -182,7 +182,7 @@ dependencies: common: - output_types: conda packages: - - cudf==25.2.*,>=0.0.0a0 + - cudf==25.4.*,>=0.0.0a0 - pandas - pytest - pytest-xdist @@ -248,13 +248,13 @@ dependencies: common: - output_types: conda packages: - - cuml==25.2.*,>=0.0.0a0 + - cuml==25.4.*,>=0.0.0a0 - scikit-learn test_cugraph: common: - output_types: conda packages: - - cugraph==25.2.*,>=0.0.0a0 + - cugraph==25.4.*,>=0.0.0a0 - networkx test_ibis: common: diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py index 33b6ffdbd5c..553d9c4459e 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py @@ -1,7 +1,8 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from __future__ import annotations +import glob import os import pickle from typing import TYPE_CHECKING, BinaryIO @@ -75,23 +76,40 @@ def swap_xfail(item: _pytest.nodes.Item, name: str): swap_xfail(item, "xfail_compare") +def get_full_nodeid(pyfuncitem): + # Get the full path to the test file + filepath = pyfuncitem.path + # Get the test name and any parameters + test_name = "::".join(pyfuncitem.nodeid.split("::")[1:]) + # Combine the full file path with the test name + full_nodeid = f"{filepath}::{test_name}" + return full_nodeid + + +def read_all_results(pattern): + results = {} + for filepath in glob.glob(pattern): + with open(filepath, "rb") as f: + results.update(dict(read_results(f))) + return results + + def pytest_configure(config: _pytest.config.Config): gold_basename = "results-gold" cudf_basename = "results-cudf-pandas" test_folder = os.path.join(os.path.dirname(__file__)) if config.getoption("--compare"): - # Everyone reads everything - gold_path = os.path.join(test_folder, f"{gold_basename}.pickle") - cudf_path = os.path.join(test_folder, f"{cudf_basename}.pickle") + gold_path = os.path.join(test_folder, f"{gold_basename}*.pickle") + cudf_path = os.path.join(test_folder, f"{cudf_basename}*.pickle") with disable_module_accelerator(): - with open(gold_path, "rb") as f: - gold_results = dict(read_results(f)) - with open(cudf_path, "rb") as f: - cudf_results = dict(read_results(f)) + gold_results = read_all_results(gold_path) + cudf_results = read_all_results(cudf_path) config.stash[results] = (gold_results, cudf_results) else: - if "cudf.pandas" in config.option.plugins: + if any( + plugin.strip() == "cudf.pandas" for plugin in config.option.plugins + ): basename = cudf_basename else: basename = gold_basename @@ -112,7 +130,7 @@ def pytest_configure(config: _pytest.config.Config): def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function): if pyfuncitem.config.getoption("--compare"): gold_results, cudf_results = pyfuncitem.config.stash[results] - key = pyfuncitem.nodeid + key = get_full_nodeid(pyfuncitem) try: gold = gold_results[key] except KeyError: @@ -140,7 +158,7 @@ def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function): # Tuple-based key-value pairs, key is the node-id try: pickle.dump( - (pyfuncitem.nodeid, result), + (get_full_nodeid(pyfuncitem), result), pyfuncitem.config.stash[file_handle_key], ) except pickle.PicklingError: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index bd2a710e84a..33c8e041b88 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cuda-python>=11.8.5,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "libcudf==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", "numba-cuda>=0.2.0,<0.3.0a0", "numba>=0.59.1,<0.61.0a0", "numpy>=1.23,<3.0a0", @@ -33,9 +33,9 @@ dependencies = [ "ptxcompiler", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", - "pylibcudf==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", "rich", - "rmm==25.2.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -120,11 +120,11 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", - "libcudf==25.2.*,>=0.0.0a0", - "librmm==25.2.*,>=0.0.0a0", + "libcudf==25.4.*,>=0.0.0a0", + "librmm==25.4.*,>=0.0.0a0", "ninja", - "pylibcudf==25.2.*,>=0.0.0a0", - "rmm==25.2.*,>=0.0.0a0", + "pylibcudf==25.4.*,>=0.0.0a0", + "rmm==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a9d937435e9..a1f15574d2d 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==25.2.*,>=0.0.0a0", + "cudf==25.4.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 3336c901e7f..3286c9ff8bc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this """DSL nodes for unary operations.""" @@ -119,6 +119,7 @@ class UnaryFunction(Expr): "abs": plc.unary.UnaryOperator.ABS, "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, "not": plc.unary.UnaryOperator.NOT, + "negate": plc.unary.UnaryOperator.NEGATE, } _supported_misc_fns = frozenset( { diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index 8f660632df2..36c7745c3f4 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition base classes.""" @@ -12,20 +12,26 @@ from collections.abc import Iterator, Sequence from cudf_polars.containers import DataFrame + from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.nodebase import Node class PartitionInfo: - """ - Partitioning information. - - This class only tracks the partition count (for now). - """ - - __slots__ = ("count",) - - def __init__(self, count: int): + """Partitioning information.""" + + __slots__ = ("count", "partitioned_on") + count: int + """Partition count.""" + partitioned_on: tuple[NamedExpr, ...] + """Columns the data is hash-partitioned on.""" + + def __init__( + self, + count: int, + partitioned_on: tuple[NamedExpr, ...] 
= (), + ): self.count = count + self.partitioned_on = partitioned_on def keys(self, node: Node) -> Iterator[tuple[str, int]]: """Return the partitioned keys for a given node.""" diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 6843ed9ee2e..5a5eaab8b2f 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition Dask execution.""" @@ -10,7 +10,8 @@ from typing import TYPE_CHECKING, Any import cudf_polars.experimental.io -import cudf_polars.experimental.select # noqa: F401 +import cudf_polars.experimental.select +import cudf_polars.experimental.shuffle # noqa: F401 from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union from cudf_polars.dsl.traversal import CachingVisitor, traversal from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py new file mode 100644 index 00000000000..d49f13375ed --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +"""Shuffle Logic.""" + +from __future__ import annotations + +import json +import operator +from typing import TYPE_CHECKING, Any + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import IR +from cudf_polars.experimental.base import _concat, get_key_name +from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node + +if TYPE_CHECKING: + from collections.abc import Hashable, MutableMapping + + from cudf_polars.dsl.expr import NamedExpr + from cudf_polars.experimental.dispatch import LowerIRTransformer + from cudf_polars.experimental.parallel import PartitionInfo + from cudf_polars.typing import Schema + + +class Shuffle(IR): + """ + Shuffle multi-partition data. + + Notes + ----- + Only hash-based partitioning is supported (for now). + """ + + __slots__ = ("keys", "options") + _non_child = ("schema", "keys", "options") + keys: tuple[NamedExpr, ...] + """Keys to shuffle on.""" + options: dict[str, Any] + """Shuffling options.""" + + def __init__( + self, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: IR, + ): + self.schema = schema + self.keys = keys + self.options = options + self._non_child_args = (schema, keys, options) + self.children = (df,) + + def get_hashable(self) -> Hashable: + """Hashable representation of the node.""" + return ( + type(self), + tuple(self.schema.items()), + self.keys, + json.dumps(self.options), + self.children, + ) + + @classmethod + def do_evaluate( + cls, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: DataFrame, + ): # pragma: no cover + """Evaluate and return a dataframe.""" + # Single-partition Shuffle evaluation is a no-op + return df + + +def _partition_dataframe( + df: DataFrame, + keys: tuple[NamedExpr, ...], + count: int, +) -> dict[int, DataFrame]: + """ + Partition an input DataFrame for shuffling. 
+ + Notes + ----- + This utility only supports hash partitioning (for now). + + Parameters + ---------- + df + DataFrame to partition. + keys + Shuffle key(s). + count + Total number of output partitions. + + Returns + ------- + A dictionary mapping between int partition indices and + DataFrame fragments. + """ + # Hash the specified keys to calculate the output + # partition for each row + partition_map = plc.binaryop.binary_operation( + plc.hashing.murmurhash3_x86_32( + DataFrame([expr.evaluate(df) for expr in keys]).table + ), + plc.interop.from_arrow(pa.scalar(count, type="uint32")), + plc.binaryop.BinaryOperator.PYMOD, + plc.types.DataType(plc.types.TypeId.UINT32), + ) + + # Apply partitioning + t, offsets = plc.partitioning.partition( + df.table, + partition_map, + count, + ) + + # Split and return the partitioned result + return { + i: DataFrame.from_table( + split, + df.column_names, + ) + for i, split in enumerate(plc.copying.split(t, offsets[1:-1])) + } + + +def _simple_shuffle_graph( + name_out: str, + name_in: str, + keys: tuple[NamedExpr, ...], + count_in: int, + count_out: int, +) -> MutableMapping[Any, Any]: + """Make a simple all-to-all shuffle graph.""" + split_name = f"split-{name_out}" + inter_name = f"inter-{name_out}" + + graph: MutableMapping[Any, Any] = {} + for part_out in range(count_out): + _concat_list = [] + for part_in in range(count_in): + graph[(split_name, part_in)] = ( + _partition_dataframe, + (name_in, part_in), + keys, + count_out, + ) + _concat_list.append((inter_name, part_out, part_in)) + graph[_concat_list[-1]] = ( + operator.getitem, + (split_name, part_in), + part_out, + ) + graph[(name_out, part_out)] = (_concat, _concat_list) + return graph + + +@lower_ir_node.register(Shuffle) +def _( + ir: Shuffle, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Simple lower_ir_node handling for the default hash-based shuffle. + # More-complex logic (e.g. joining and sorting) should + # be handled separately. + from cudf_polars.experimental.parallel import PartitionInfo + + (child,) = ir.children + + new_child, pi = rec(child) + if pi[new_child].count == 1 or ir.keys == pi[new_child].partitioned_on: + # Already shuffled + return new_child, pi + new_node = ir.reconstruct([new_child]) + pi[new_node] = PartitionInfo( + # Default shuffle preserves partition count + count=pi[new_child].count, + # Add partitioned_on info + partitioned_on=ir.keys, + ) + return new_node, pi + + +@generate_ir_tasks.register(Shuffle) +def _( + ir: Shuffle, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + # Use a simple all-to-all shuffle graph. + + # TODO: Optionally use rapidsmp. + return _simple_shuffle_graph( + get_key_name(ir), + get_key_name(ir.children[0]), + ir.keys, + partition_info[ir.children[0]].count, + partition_info[ir].count, + ) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index a8cad5622fb..be48d500a36 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-25.02/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-25.04/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 15547f85d56..805d7925bb4 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "polars>=1.20,<1.22",
-    "pylibcudf==25.2.*,>=0.0.0a0",
+    "pylibcudf==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -41,7 +41,7 @@ test = [
     "pytest<8",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 experimental = [
-    "rapids-dask-dependency==25.2.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
diff --git a/python/cudf_polars/tests/experimental/test_shuffle.py b/python/cudf_polars/tests/experimental/test_shuffle.py
new file mode 100644
index 00000000000..294557fd0d6
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_shuffle.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+from cudf_polars import Translator
+from cudf_polars.dsl.expr import Col, NamedExpr
+from cudf_polars.experimental.parallel import evaluate_dask, lower_ir_graph
+from cudf_polars.experimental.shuffle import Shuffle
+
+
+@pytest.fixture(scope="module")
+def engine():
+    return pl.GPUEngine(
+        raise_on_fail=True,
+        executor="dask-experimental",
+        executor_options={"max_rows_per_partition": 4},
+    )
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.LazyFrame(
+        {
+            "x": [1, 2, 3, 4, 5, 6, 7],
+            "y": [1, 1, 1, 1, 1, 1, 1],
+            "z": ["a", "b", "c", "d", "e", "f", "g"],
+        }
+    )
+
+
+def test_hash_shuffle(df, engine):
+    # Extract translated IR
+    qir = Translator(df._ldf.visit(), engine).translate_ir()
+
+    # Add first Shuffle node
+    keys = (NamedExpr("x", Col(qir.schema["x"], "x")),)
+    options = {}
+    qir1 = Shuffle(qir.schema, keys, options, qir)
+
+    # Add second Shuffle node (on the same keys)
+    qir2 = Shuffle(qir.schema, keys, options, qir1)
+
+    # Check that sequential shuffles on the same keys
+    # are replaced with a single shuffle node
+    partition_info = lower_ir_graph(qir2)[1]
+    assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 1
+
+    # Add a third Shuffle node (on different keys)
+    keys2 = (NamedExpr("z", Col(qir.schema["z"], "z")),)
+    qir3 = Shuffle(qir2.schema, keys2, options, qir2)
+
+    # Check that we have an additional shuffle
+    # node after shuffling on different keys
+    partition_info = lower_ir_graph(qir3)[1]
+    assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 2
+
+    # Check that Dask evaluation works
+    result = evaluate_dask(qir3).to_polars()
+    expect = df.collect(engine="cpu")
+    assert_frame_equal(result, expect, check_row_order=False)
diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py
index ac3aecf88e6..75bf0960e10 100644
--- a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py
+++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py
@@ -1,7 +1,9 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
 
+from datetime import timedelta
+
 import numpy as np
 import pytest
 
@@ -58,6 +60,7 @@ def ldf(with_nulls, dtype):
         {
             "a": pl.Series(values, dtype=dtype),
             "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32),
+            "c": pl.Series([timedelta(hours=i) for i in range(len(values))]),
         }
     )
 
@@ -89,3 +92,9 @@ def test_log(ldf, natural):
 
     q = ldf.select(expr)
     assert_gpu_result_equal(q, check_exact=False)
+
+
+@pytest.mark.parametrize("col", ["a", "b", "c"])
+def test_negate(ldf, col):
+    q = ldf.select(-pl.col(col))
+    assert_gpu_result_equal(q)
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 7820157d89b..665b0a76ecf 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 [build-system]
 build-backend = "rapids_build_backend.build"
@@ -20,8 +20,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "confluent-kafka>=2.5.0,<2.6.0a0",
-    "cudf==25.2.*,>=0.0.0a0",
-    "cudf_kafka==25.2.*,>=0.0.0a0",
+    "cudf==25.4.*,>=0.0.0a0",
+    "cudf_kafka==25.4.*,>=0.0.0a0",
     "streamz",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 3725722a8ae..87bf282f376 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -19,13 +19,13 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "cudf==25.2.*,>=0.0.0a0",
+    "cudf==25.4.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numpy>=1.23,<3.0a0",
     "pandas>=2.0,<2.2.4dev0",
     "pynvml>=12.0.0,<13.0.0a0",
-    "rapids-dask-dependency==25.2.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -46,7 +46,7 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint"
 
 [project.optional-dependencies]
 test = [
-    "dask-cuda==25.2.*,>=0.0.0a0",
+    "dask-cuda==25.4.*,>=0.0.0a0",
     "numba-cuda>=0.2.0,<0.3.0a0",
     "numba>=0.59.1,<0.61.0a0",
     "pytest-cov",
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index d16ad97ec54..7d3b6c09c61 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
     "Environment :: GPU :: NVIDIA CUDA",
 ]
 dependencies = [
-    "libkvikio==25.2.*,>=0.0.0a0",
+    "libkvikio==25.4.*,>=0.0.0a0",
     "nvidia-nvcomp==4.1.0.6",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -78,7 +78,7 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
-    "libkvikio==25.2.*,>=0.0.0a0",
-    "librmm==25.2.*,>=0.0.0a0",
+    "libkvikio==25.4.*,>=0.0.0a0",
+    "librmm==25.4.*,>=0.0.0a0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/pylibcudf/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
index 4666012623e..802d4b392a8 100644
--- a/python/pylibcudf/pylibcudf/libcudf/unary.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -33,6 +33,7 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
         RINT
         BIT_INVERT
         NOT
+        NEGATE
 
     cdef extern unique_ptr[column] unary_operation(
         column_view input,
diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi
index 7aa23b618f4..4d06a51c03a 100644
--- a/python/pylibcudf/pylibcudf/unary.pyi
+++ b/python/pylibcudf/pylibcudf/unary.pyi
@@ -28,6 +28,7 @@ class UnaryOperator(IntEnum):
     RINT = ...
     BIT_INVERT = ...
     NOT = ...
+    NEGATE = ...
 
 def unary_operation(input: Column, op: UnaryOperator) -> Column: ...
 def is_null(input: Column) -> Column: ...
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index efa3d301334..300138c9b4a 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -19,12 +19,12 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "cuda-python>=11.8.5,<12.0a0",
-    "libcudf==25.2.*,>=0.0.0a0",
+    "libcudf==25.4.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
     "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'",
     "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'",
-    "rmm==25.2.*,>=0.0.0a0",
+    "rmm==25.4.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -111,10 +111,10 @@ matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
-    "libcudf==25.2.*,>=0.0.0a0",
-    "librmm==25.2.*,>=0.0.0a0",
+    "libcudf==25.4.*,>=0.0.0a0",
+    "librmm==25.4.*,>=0.0.0a0",
     "ninja",
-    "rmm==25.2.*,>=0.0.0a0",
+    "rmm==25.4.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [tool.scikit-build]
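Editor's note (illustrative, not part of the patch): once the new `NEGATE` enum value is exposed, it flows through the existing `unary_operation` entry point shown in the `.pyi` stub above. A rough usage sketch, assuming a GPU and that `pylibcudf.interop` round-trips through pyarrow as in current releases:

```python
import pyarrow as pa
import pylibcudf as plc

# Negate a small INT64 column with the newly exposed operator.
col = plc.interop.from_arrow(pa.array([1, -2, 3]))
out = plc.unary.unary_operation(col, plc.unary.UnaryOperator.NEGATE)
print(plc.interop.to_arrow(out))  # expected: [-1, 2, -3]
```

On the polars side, this is the operator that backs `-pl.col(...)`, as exercised by the new `test_negate` test (including the timedelta column `"c"` added to the fixture).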