From 535db9b26ed1a57e4275f4a6f11b04ebeee21248 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:28:14 -0700
Subject: [PATCH] Deprecate Arrow support in I/O (#16132)

Contributes to https://github.com/rapidsai/cudf/issues/15193

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16132
---
 .../cudf/_lib/pylibcudf/io/datasource.pyx   |  10 +-
 python/cudf/cudf/io/csv.py                  |   2 +-
 python/cudf/cudf/io/orc.py                  |  33 +++--
 python/cudf/cudf/io/parquet.py              |  40 ++++--
 .../io/test_source_sink_info.py             |  21 +--
 python/cudf/cudf/tests/test_csv.py          |   5 +-
 python/cudf/cudf/tests/test_gcs.py          |   3 +-
 python/cudf/cudf/tests/test_parquet.py      |  19 +--
 python/cudf/cudf/tests/test_s3.py           | 136 ++++++++++--------
 python/cudf/cudf/utils/ioutils.py           |  78 ++++++++--
 python/cudf/cudf/utils/utils.py             |  26 ++++
 .../dask_cudf/dask_cudf/io/tests/test_s3.py |   6 +-
 12 files changed, 247 insertions(+), 132 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
index aa7fa0efdaf..8f265f585de 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
@@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile
 from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
+import warnings
+
 
 cdef class Datasource:
     cdef datasource* get_datasource(self) except * nogil:
@@ -16,10 +18,16 @@ cdef class Datasource:
 
 cdef class NativeFileDatasource(Datasource):
 
-    def __cinit__(self, NativeFile native_file,):
+    def __cinit__(self, NativeFile native_file):
 
         cdef shared_ptr[CRandomAccessFile] ra_src
 
+        warnings.warn(
+            "Support for reading pyarrow's NativeFile is deprecated "
+            "and will be removed in a future release of cudf.",
+            FutureWarning,
+        )
+
         ra_src = native_file.get_random_access_file()
         self.c_datasource.reset(new arrow_io_source(ra_src))
 
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index e909d96309e..0f2820a01e9 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -50,7 +50,7 @@ def read_csv(
     comment=None,
     delim_whitespace=False,
     byte_range=None,
-    use_python_file_object=True,
+    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 7082a85237a..289292b5182 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -10,6 +10,7 @@
 from cudf._lib import orc as liborc
 from cudf.api.types import is_list_like
 from cudf.utils import ioutils
+from cudf.utils.utils import maybe_filter_deprecation
 
 
 def _make_empty_df(filepath_or_buffer, columns):
@@ -280,7 +281,7 @@ def read_orc(
     num_rows=None,
     use_index=True,
     timestamp_type=None,
-    use_python_file_object=True,
+    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
@@ -320,6 +321,9 @@ def read_orc(
         )
 
     filepaths_or_buffers = []
+    have_nativefile = any(
+        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
+    )
     for source in filepath_or_buffer:
         if ioutils.is_directory(
             path_or_data=source, storage_options=storage_options
@@ -360,17 +364,24 @@ def read_orc(
             stripes = selected_stripes
 
     if engine == "cudf":
-        return DataFrame._from_data(
-            *liborc.read_orc(
-                filepaths_or_buffers,
-                columns,
-                stripes,
-                skiprows,
-                num_rows,
-                use_index,
-                timestamp_type,
+        # Don't want to warn if use_python_file_object causes us to get
+        # a NativeFile (there is a separate deprecation warning for that)
+        with maybe_filter_deprecation(
+            not have_nativefile,
+            message="Support for reading pyarrow's NativeFile is deprecated",
+            category=FutureWarning,
+        ):
+            return DataFrame._from_data(
+                *liborc.read_orc(
+                    filepaths_or_buffers,
+                    columns,
+                    stripes,
+                    skiprows,
+                    num_rows,
+                    use_index,
+                    timestamp_type,
+                )
             )
-        )
     else:
         from pyarrow import orc
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 02b26ea1c01..0f0a240b5d0 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pyarrow import dataset as ds
 
 import cudf
@@ -23,6 +24,7 @@
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
+from cudf.utils.utils import maybe_filter_deprecation
 
 BYTE_SIZES = {
     "kb": 1000,
@@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer):
             path_or_data=source,
             compression=None,
             fs=fs,
-            use_python_file_object=True,
+            use_python_file_object=None,
             open_file_options=None,
             storage_options=None,
             bytes_per_thread=None,
@@ -532,7 +534,7 @@ def read_parquet(
     filters=None,
     row_groups=None,
     use_pandas_metadata=True,
-    use_python_file_object=True,
+    use_python_file_object=None,
     categorical_partitions=True,
     open_file_options=None,
     bytes_per_thread=None,
@@ -615,6 +617,9 @@ def read_parquet(
             row_groups=row_groups,
             fs=fs,
         )
+    have_nativefile = any(
+        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
+    )
     for source in filepath_or_buffer:
         tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
@@ -662,19 +667,26 @@ def read_parquet(
     )
 
     # Convert parquet data to a cudf.DataFrame
-    df = _parquet_to_frame(
-        filepaths_or_buffers,
-        engine,
-        *args,
-        columns=columns,
-        row_groups=row_groups,
-        use_pandas_metadata=use_pandas_metadata,
-        partition_keys=partition_keys,
-        partition_categories=partition_categories,
-        dataset_kwargs=dataset_kwargs,
-        **kwargs,
-    )
+    # Don't want to warn if use_python_file_object causes us to get
+    # a NativeFile (there is a separate deprecation warning for that)
+    with maybe_filter_deprecation(
+        not have_nativefile,
+        message="Support for reading pyarrow's NativeFile is deprecated",
+        category=FutureWarning,
+    ):
+        df = _parquet_to_frame(
+            filepaths_or_buffers,
+            engine,
+            *args,
+            columns=columns,
+            row_groups=row_groups,
+            use_pandas_metadata=use_pandas_metadata,
+            partition_keys=partition_keys,
+            partition_categories=partition_categories,
+            dataset_kwargs=dataset_kwargs,
+            **kwargs,
+        )
 
     # Apply filters row-wise (if any are defined), and return
     df = _apply_post_filters(df, filters)
     if projected_columns:
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
index 287dd8f21c8..438c482b77a 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
@@ -2,11 +2,9 @@
 
 import io
 
-import pyarrow as pa
 import pytest
 
 import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
 
 
 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
@@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink):
     """
     Skip invalid sinks for SinkInfo
     """
-    if io_class is plc.io.SinkInfo and isinstance(
-        sink, (bytes, NativeFileDatasource)
-    ):
-        pytest.skip(f"{sink} is not a valid input for SinkInfo")
+    if io_class is plc.io.SinkInfo and isinstance(sink, bytes):
+        pytest.skip("bytes is not a valid input for SinkInfo")
 
 
 @pytest.mark.parametrize(
@@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink):
         "a.txt",
         b"hello world",
         io.BytesIO(b"hello world"),
-        NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
     ],
 )
 def test_source_info_ctor(io_class, source, tmp_path):
@@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path):
 @pytest.mark.parametrize(
     "sources",
     [
+        ["a.txt"],
+        [b"hello world"],
+        [io.BytesIO(b"hello world")],
         ["a.txt", "a.txt"],
         [b"hello world", b"hello there"],
         [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
-        [
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-        ],
     ],
 )
 def test_source_info_ctor_multiple(io_class, sources, tmp_path):
@@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path):
             io.BytesIO(b"hello there"),
             b"hello world",
         ],
-        [
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-            "awef.txt",
-            b"hello world",
-        ],
     ],
 )
 def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0525b02b698..6a21cb1b9d7 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf):
     # Arrow FileSystem interface
     expect = cudf.read_csv(path_or_buf("filepath"))
     fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath"))
-    with fs.open_input_file(path) as fil:
-        got = cudf.read_csv(fil)
+    with pytest.warns(FutureWarning):
+        with fs.open_input_file(path) as fil:
+            got = cudf.read_csv(fil)
 
     assert_eq(expect, got)
 
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index fc22d8bc0ea..28fdfb5c2f1 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -46,7 +46,8 @@ def mock_size(*args):
     # use_python_file_object=True, because the pyarrow
     # `open_input_file` command will fail (since it doesn't
     # use the monkey-patched `open` definition)
-    got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
+    with pytest.warns(FutureWarning):
+        got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
     assert_eq(pdf, got)
 
     # AbstractBufferedFile -> PythonFile conversion
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index ecb7fd44422..f2820d9c112 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf):
     expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
     fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath"))
     with fs.open_input_file(path) as fil:
-        got = cudf.read_parquet(fil)
+        with pytest.warns(FutureWarning):
+            got = cudf.read_parquet(fil)
 
     assert_eq(expect, got)
 
@@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object(
     fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath"))
 
     # Pass open fsspec file
-    with fs.open(paths[0], mode="rb") as fil:
-        got1 = cudf.read_parquet(
-            fil, use_python_file_object=use_python_file_object
-        )
+    with pytest.warns(FutureWarning):
+        with fs.open(paths[0], mode="rb") as fil:
+            got1 = cudf.read_parquet(
+                fil, use_python_file_object=use_python_file_object
+            )
 
     assert_eq(expect, got1)
 
     # Pass path only
-    got2 = cudf.read_parquet(
-        paths[0], use_python_file_object=use_python_file_object
-    )
+    with pytest.warns(FutureWarning):
+        got2 = cudf.read_parquet(
+            paths[0], use_python_file_object=use_python_file_object
+        )
 
     assert_eq(expect, got2)
 
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index a44bf791767..3ae318d3bf5 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     buffer = pdf.to_csv(index=False)
 
     # Use fsspec file object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            use_python_file_object=False,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                bytes_per_thread=bytes_per_thread,
+                use_python_file_object=False,
+            )
     assert_eq(pdf, got)
 
     # Use Arrow PythonFile object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            use_python_file_object=True,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                use_python_file_object=True,
+            )
     assert_eq(pdf, got)
 
 
@@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
     fs = pa_fs.S3FileSystem(
         endpoint_override=s3so["client_kwargs"]["endpoint_url"],
     )
-    with fs.open_input_file(f"{bucket}/{fname}") as fil:
-        got = cudf.read_csv(fil)
+    with pytest.warns(FutureWarning):
+        with fs.open_input_file(f"{bucket}/{fname}") as fil:
+            got = cudf.read_csv(fil)
 
     assert_eq(pdf, got)
 
@@ -184,17 +187,18 @@ def test_read_csv_byte_range(
 
     # Use fsspec file object
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            byte_range=(74, 73),
-            bytes_per_thread=bytes_per_thread
-            if not use_python_file_object
-            else None,
-            header=None,
-            names=["Integer", "Float", "Integer2", "String", "Boolean"],
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                byte_range=(74, 73),
+                bytes_per_thread=bytes_per_thread
+                if not use_python_file_object
+                else None,
+                header=None,
+                names=["Integer", "Float", "Integer2", "String", "Boolean"],
+                use_python_file_object=use_python_file_object,
+            )
 
     assert_eq(pdf.iloc[-2:].reset_index(drop=True), got)
 
@@ -241,18 +245,19 @@ def test_read_parquet(
     # Check direct path handling
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got1 = cudf.read_parquet(
-            f"s3://{bucket}/{fname}",
-            open_file_options=(
-                {"precache_options": {"method": precache}}
-                if use_python_file_object
-                else None
-            ),
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            columns=columns,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got1 = cudf.read_parquet(
f"s3://{bucket}/{fname}", + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -263,12 +268,13 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) assert_eq(expect, got2) @@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): pdf.to_parquet(path=buffer) buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) + with pytest.warns(FutureWarning): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf assert_eq(expect, got) @@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + with pytest.warns(FutureWarning): + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + filters=filters, + open_file_options={"precache_options": {"method": precache}}, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): buffer = f.read() with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, + ) if columns: expect = expect[columns] @@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 76c7f2bfdb8..80555750b3a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -6,6 +6,7 @@ import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread +from typing import Callable 
 
 import fsspec
 import fsspec.implementations.local
@@ -15,6 +16,7 @@
 from pyarrow import PythonFile as ArrowPythonFile
 from pyarrow.lib import NativeFile
 
+from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
@@ -24,7 +26,6 @@
 except ImportError:
     fsspec_parquet = None
 
-
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
 
@@ -86,7 +87,7 @@
 1  20     rapids
 2  30         ai
 """.format(remote_data_sources=_docstring_remote_sources)
-doc_read_avro = docfmt_partial(docstring=_docstring_read_avro)
+doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro)
 
 _docstring_read_parquet_metadata = """
 Read a Parquet file's metadata and schema
@@ -174,15 +175,23 @@
     columns are also loaded.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. Setting this argument to `False`
-    will require the entire file to be copied to host memory, and is highly
-    discouraged.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 open_file_options : dict, optional
     Dictionary of key-value pairs to pass to the function used to open remote
     files. By default, this will be `fsspec.parquet.open_parquet_file`. To
     deactivate optimized precaching, set the "method" to `None` under the
     "precache_options" key. Note that the `open_file_func` key can also be
     used to specify a custom file-open function.
+
+    .. deprecated:: 24.08
+        `open_file_options` is deprecated as it was intended for
+        pyarrow file inputs, which will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 bytes_per_thread : int, default None
     Determines the number of bytes to be allocated per thread to
     read the files in parallel. When there is a file of large size, we get
     slightly
@@ -468,8 +477,12 @@
     If True, use row index if available for faster seeking.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. This option is likely to improve
-    performance when making small reads from larger ORC files.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -934,7 +947,7 @@
 --------
 cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame.
 """
-doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf)
+doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf)
 
 _docstring_to_hdf = """
 Write the contained data to an HDF5 file using HDFStore.
@@ -1006,7 +1019,7 @@
 cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
 cudf.DataFrame.to_feather : Write out feather-format for DataFrames.
""" -doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf) +doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) _docstring_read_feather = """ Load an feather object from the file path, returning a DataFrame. @@ -1188,8 +1201,12 @@ the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger CSV files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -1409,7 +1426,7 @@ result : Series """ -doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) +doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) _docstring_get_reader_filepath_or_buffer = """ @@ -1430,9 +1447,19 @@ use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers. open_file_options : dict, optional Optional dictionary of keyword arguments to pass to `_open_remote_files` (used for remote storage only). + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem. If False, @@ -1708,7 +1735,8 @@ def get_reader_filepath_or_buffer( mode="rb", fs=None, iotypes=(BytesIO, NativeFile), - use_python_file_object=False, + # no_default aliases to False + use_python_file_object=no_default, open_file_options=None, allow_raw_text_input=False, storage_options=None, @@ -1720,6 +1748,30 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) + if use_python_file_object is no_default: + use_python_file_object = False + elif use_python_file_object is not None: + warnings.warn( + "The 'use_python_file_object' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + else: + # Preserve the readers (e.g. 
+        # Preserve the readers' (e.g. read_csv) default of True
+        # if no use_python_file_object option is specified by the user
+        # for now (note: this is different from the default for this
+        # function of False)
+        # TODO: when non-pyarrow file reading perf is good enough
+        # we can default this to False
+        use_python_file_object = True
+
+    if open_file_options is not None:
+        warnings.warn(
+            "The 'open_file_options' keyword is deprecated and "
+            "will be removed in a future version.",
+            FutureWarning,
+        )
+
     if isinstance(path_or_data, str):
         # Get a filesystem object if one isn't already available
         paths = [path_or_data]
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 7347ec7866a..c9b343e0f9f 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -6,6 +6,7 @@
 import os
 import traceback
 import warnings
+from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
@@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
     if result_mask is not None:
         result_col = result_col.set_mask(result_mask.as_mask())
     return result_col
+
+
+@contextmanager
+def maybe_filter_deprecation(
+    condition: bool, message: str, category: type[Warning]
+):
+    """Conditionally filter a warning category.
+
+    Parameters
+    ----------
+    condition
+        If true, filter the warning
+    message
+        Message to match, passed to :func:`warnings.filterwarnings`
+    category
+        Category of warning, passed to :func:`warnings.filterwarnings`
+    """
+    with warnings.catch_warnings():
+        if condition:
+            warnings.filterwarnings(
+                "ignore",
+                message,
+                category=category,
+            )
+        yield
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..3947c69aaa5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options):
         storage_options=s3so,
         open_file_options=open_file_options,
     )
-    assert df.a.sum().compute() == 10
-    assert df.b.sum().compute() == 9
+    with pytest.warns(FutureWarning):
+        assert df.a.sum().compute() == 10
+    with pytest.warns(FutureWarning):
+        assert df.b.sum().compute() == 9
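
The warning-filtering pattern above can be exercised on its own. Below is a
minimal standalone sketch (not part of the patch; the read_orc_like helper
and its string-based NativeFile check are illustrative placeholders for the
real reader logic):

import warnings
from contextlib import contextmanager


@contextmanager
def maybe_filter_deprecation(condition, message, category):
    # Same logic as the helper added to python/cudf/cudf/utils/utils.py:
    # suppress warnings matching `message` only while `condition` is true.
    with warnings.catch_warnings():
        if condition:
            warnings.filterwarnings("ignore", message, category=category)
        yield


def read_orc_like(sources):
    # Illustrative stand-in for cudf.read_orc: filter the internal
    # NativeFile deprecation warning only when the user did NOT pass a
    # NativeFile themselves, so user-facing deprecations still surface.
    have_nativefile = any(s == "nativefile" for s in sources)
    with maybe_filter_deprecation(
        not have_nativefile,
        message="Support for reading pyarrow's NativeFile is deprecated",
        category=FutureWarning,
    ):
        # Mimics the warning raised by NativeFileDatasource.__cinit__
        # when the reader constructs a NativeFile-backed datasource.
        warnings.warn(
            "Support for reading pyarrow's NativeFile is deprecated "
            "and will be removed in a future release of cudf.",
            FutureWarning,
        )


if __name__ == "__main__":
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        read_orc_like(["file.orc"])    # internal warning is suppressed
        read_orc_like(["nativefile"])  # warning reaches the user
    assert len(caught) == 1  # only the explicit NativeFile input warns

Filtering only when the caller did not themselves pass a NativeFile keeps the
user-facing deprecation visible while silencing the duplicate warning that
use_python_file_object=True would otherwise trigger internally.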