From 535db9b26ed1a57e4275f4a6f11b04ebeee21248 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:28:14 -0700
Subject: [PATCH] Deprecate Arrow support in I/O (#16132)

Contributes to https://github.com/rapidsai/cudf/issues/15193

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16132
---
 .../cudf/_lib/pylibcudf/io/datasource.pyx   |  10 +-
 python/cudf/cudf/io/csv.py                  |   2 +-
 python/cudf/cudf/io/orc.py                  |  33 +++--
 python/cudf/cudf/io/parquet.py              |  40 ++++--
 .../io/test_source_sink_info.py             |  21 +--
 python/cudf/cudf/tests/test_csv.py          |   5 +-
 python/cudf/cudf/tests/test_gcs.py          |   3 +-
 python/cudf/cudf/tests/test_parquet.py      |  19 +--
 python/cudf/cudf/tests/test_s3.py           | 136 ++++++++++--------
 python/cudf/cudf/utils/ioutils.py           |  78 ++++++++--
 python/cudf/cudf/utils/utils.py             |  26 ++++
 .../dask_cudf/dask_cudf/io/tests/test_s3.py |   6 +-
 12 files changed, 247 insertions(+), 132 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
index aa7fa0efdaf..8f265f585de 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
@@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile
 from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
+import warnings
+
 
 cdef class Datasource:
     cdef datasource* get_datasource(self) except * nogil:
@@ -16,10 +18,16 @@ cdef class Datasource:
 
 cdef class NativeFileDatasource(Datasource):
 
-    def __cinit__(self, NativeFile native_file,):
+    def __cinit__(self, NativeFile native_file):
 
         cdef shared_ptr[CRandomAccessFile] ra_src
 
+        warnings.warn(
+            "Support for reading pyarrow's NativeFile is deprecated "
+            "and will be removed in a future release of cudf.",
+            FutureWarning,
+        )
+
         ra_src = native_file.get_random_access_file()
         self.c_datasource.reset(new arrow_io_source(ra_src))
 
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index e909d96309e..0f2820a01e9 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -50,7 +50,7 @@ def read_csv(
     comment=None,
     delim_whitespace=False,
     byte_range=None,
-    use_python_file_object=True,
+    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 7082a85237a..289292b5182 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -10,6 +10,7 @@
 from cudf._lib import orc as liborc
 from cudf.api.types import is_list_like
 from cudf.utils import ioutils
+from cudf.utils.utils import maybe_filter_deprecation
 
 
 def _make_empty_df(filepath_or_buffer, columns):
@@ -280,7 +281,7 @@ def read_orc(
     num_rows=None,
     use_index=True,
     timestamp_type=None,
-    use_python_file_object=True,
+    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
@@ -320,6 +321,9 @@ def read_orc(
         )
 
     filepaths_or_buffers = []
+    have_nativefile = any(
+        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
+    )
     for source in filepath_or_buffer:
         if ioutils.is_directory(
             path_or_data=source, storage_options=storage_options
@@ -360,17 +364,24 @@ def read_orc(
             stripes = selected_stripes
 
     if engine == "cudf":
-        return DataFrame._from_data(
-            *liborc.read_orc(
-                filepaths_or_buffers,
-                columns,
-                stripes,
-                skiprows,
-                num_rows,
-                use_index,
-                timestamp_type,
+        # Don't want to warn if use_python_file_object causes us to get
+        # a NativeFile (there is a separate deprecation warning for that)
+        with maybe_filter_deprecation(
+            not have_nativefile,
+            message="Support for reading pyarrow's NativeFile is deprecated",
+            category=FutureWarning,
+        ):
+            return DataFrame._from_data(
+                *liborc.read_orc(
+                    filepaths_or_buffers,
+                    columns,
+                    stripes,
+                    skiprows,
+                    num_rows,
+                    use_index,
+                    timestamp_type,
+                )
             )
-        )
     else:
         from pyarrow import orc
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 02b26ea1c01..0f0a240b5d0 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pyarrow import dataset as ds
 
 import cudf
@@ -23,6 +24,7 @@
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
+from cudf.utils.utils import maybe_filter_deprecation
 
 BYTE_SIZES = {
     "kb": 1000,
@@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer):
             path_or_data=source,
             compression=None,
             fs=fs,
-            use_python_file_object=True,
+            use_python_file_object=None,
             open_file_options=None,
             storage_options=None,
             bytes_per_thread=None,
@@ -532,7 +534,7 @@ def read_parquet(
     filters=None,
     row_groups=None,
     use_pandas_metadata=True,
-    use_python_file_object=True,
+    use_python_file_object=None,
     categorical_partitions=True,
     open_file_options=None,
     bytes_per_thread=None,
@@ -615,6 +617,9 @@ def read_parquet(
             row_groups=row_groups,
             fs=fs,
         )
+    have_nativefile = any(
+        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
+    )
     for source in filepath_or_buffer:
         tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
@@ -662,19 +667,26 @@ def read_parquet(
     )
 
     # Convert parquet data to a cudf.DataFrame
-    df = _parquet_to_frame(
-        filepaths_or_buffers,
-        engine,
-        *args,
-        columns=columns,
-        row_groups=row_groups,
-        use_pandas_metadata=use_pandas_metadata,
-        partition_keys=partition_keys,
-        partition_categories=partition_categories,
-        dataset_kwargs=dataset_kwargs,
-        **kwargs,
-    )
+    # Don't want to warn if use_python_file_object causes us to get
+    # a NativeFile (there is a separate deprecation warning for that)
+    with maybe_filter_deprecation(
+        not have_nativefile,
+        message="Support for reading pyarrow's NativeFile is deprecated",
+        category=FutureWarning,
+    ):
+        df = _parquet_to_frame(
+            filepaths_or_buffers,
+            engine,
+            *args,
+            columns=columns,
+            row_groups=row_groups,
+            use_pandas_metadata=use_pandas_metadata,
+            partition_keys=partition_keys,
+            partition_categories=partition_categories,
+            dataset_kwargs=dataset_kwargs,
+            **kwargs,
+        )
 
     # Apply filters row-wise (if any are defined), and return
     df = _apply_post_filters(df, filters)
     if projected_columns:
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
index 287dd8f21c8..438c482b77a 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
@@ -2,11 +2,9 @@
 
 import io
 
-import pyarrow as pa
 import pytest
 
 import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
 
 
 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
@@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink):
     """
     Skip invalid sinks for SinkInfo
     """
-    if io_class is plc.io.SinkInfo and isinstance(
-        sink, (bytes, NativeFileDatasource)
-    ):
-        pytest.skip(f"{sink} is not a valid input for SinkInfo")
+    if io_class is plc.io.SinkInfo and isinstance(sink, bytes):
+        pytest.skip("bytes is not a valid input for SinkInfo")
 
 
 @pytest.mark.parametrize(
@@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink):
         "a.txt",
         b"hello world",
         io.BytesIO(b"hello world"),
-        NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
     ],
 )
 def test_source_info_ctor(io_class, source, tmp_path):
@@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path):
 @pytest.mark.parametrize(
     "sources",
     [
+        ["a.txt"],
+        [b"hello world"],
+        [io.BytesIO(b"hello world")],
         ["a.txt", "a.txt"],
         [b"hello world", b"hello there"],
         [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
-        [
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-        ],
     ],
 )
 def test_source_info_ctor_multiple(io_class, sources, tmp_path):
@@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path):
             io.BytesIO(b"hello there"),
             b"hello world",
         ],
-        [
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-            "awef.txt",
-            b"hello world",
-        ],
     ],
 )
 def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0525b02b698..6a21cb1b9d7 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf):
     # Arrow FileSystem interface
     expect = cudf.read_csv(path_or_buf("filepath"))
     fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath"))
-    with fs.open_input_file(path) as fil:
-        got = cudf.read_csv(fil)
+    with pytest.warns(FutureWarning):
+        with fs.open_input_file(path) as fil:
+            got = cudf.read_csv(fil)
 
     assert_eq(expect, got)
 
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index fc22d8bc0ea..28fdfb5c2f1 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -46,7 +46,8 @@ def mock_size(*args):
     # use_python_file_object=True, because the pyarrow
     # `open_input_file` command will fail (since it doesn't
     # use the monkey-patched `open` definition)
-    got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
+    with pytest.warns(FutureWarning):
+        got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
     assert_eq(pdf, got)
 
     # AbstractBufferedFile -> PythonFile conversion
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index ecb7fd44422..f2820d9c112 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf):
     expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
     fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath"))
     with fs.open_input_file(path) as fil:
-        got = cudf.read_parquet(fil)
+        with pytest.warns(FutureWarning):
+            got = cudf.read_parquet(fil)
 
     assert_eq(expect, got)
 
@@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object(
     fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath"))
 
     # Pass open fsspec file
-    with fs.open(paths[0], mode="rb") as fil:
-        got1 = cudf.read_parquet(
-            fil, use_python_file_object=use_python_file_object
-        )
+    with pytest.warns(FutureWarning):
+        with fs.open(paths[0], mode="rb") as fil:
+            got1 = cudf.read_parquet(
+                fil, use_python_file_object=use_python_file_object
+            )
 
     assert_eq(expect, got1)
 
     # Pass path only
-    got2 = cudf.read_parquet(
-        paths[0], use_python_file_object=use_python_file_object
-    )
+    with pytest.warns(FutureWarning):
+        got2 = cudf.read_parquet(
+            paths[0], use_python_file_object=use_python_file_object
+        )
 
     assert_eq(expect, got2)
 
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index a44bf791767..3ae318d3bf5 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     buffer = pdf.to_csv(index=False)
 
     # Use fsspec file object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            use_python_file_object=False,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                bytes_per_thread=bytes_per_thread,
+                use_python_file_object=False,
+            )
     assert_eq(pdf, got)
 
     # Use Arrow PythonFile object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            use_python_file_object=True,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                use_python_file_object=True,
+            )
     assert_eq(pdf, got)
 
 
@@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
     fs = pa_fs.S3FileSystem(
         endpoint_override=s3so["client_kwargs"]["endpoint_url"],
     )
-    with fs.open_input_file(f"{bucket}/{fname}") as fil:
-        got = cudf.read_csv(fil)
+    with pytest.warns(FutureWarning):
+        with fs.open_input_file(f"{bucket}/{fname}") as fil:
+            got = cudf.read_csv(fil)
 
     assert_eq(pdf, got)
 
@@ -184,17 +187,18 @@ def test_read_csv_byte_range(
 
     # Use fsspec file object
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            byte_range=(74, 73),
-            bytes_per_thread=bytes_per_thread
-            if not use_python_file_object
-            else None,
-            header=None,
-            names=["Integer", "Float", "Integer2", "String", "Boolean"],
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                byte_range=(74, 73),
+                bytes_per_thread=bytes_per_thread
+                if not use_python_file_object
+                else None,
+                header=None,
+                names=["Integer", "Float", "Integer2", "String", "Boolean"],
+                use_python_file_object=use_python_file_object,
+            )
 
     assert_eq(pdf.iloc[-2:].reset_index(drop=True), got)
 
@@ -241,18 +245,19 @@ def test_read_parquet(
     # Check direct path handling
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got1 = cudf.read_parquet(
-            f"s3://{bucket}/{fname}",
-            open_file_options=(
-                {"precache_options": {"method": precache}}
-                if use_python_file_object
-                else None
-            ),
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            columns=columns,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got1 = cudf.read_parquet(
f"s3://{bucket}/{fname}", + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -263,12 +268,13 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) assert_eq(expect, got2) @@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): pdf.to_parquet(path=buffer) buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) + with pytest.warns(FutureWarning): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf assert_eq(expect, got) @@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + with pytest.warns(FutureWarning): + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + filters=filters, + open_file_options={"precache_options": {"method": precache}}, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): buffer = f.read() with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, + ) if columns: expect = expect[columns] @@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 76c7f2bfdb8..80555750b3a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -6,6 +6,7 @@ import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread +from typing import Callable 
 
 import fsspec
 import fsspec.implementations.local
@@ -15,6 +16,7 @@
 from pyarrow import PythonFile as ArrowPythonFile
 from pyarrow.lib import NativeFile
 
+from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
@@ -24,7 +26,6 @@
 except ImportError:
     fsspec_parquet = None
 
-
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
 
@@ -86,7 +87,7 @@
 1  20     rapids
 2  30         ai
 """.format(remote_data_sources=_docstring_remote_sources)
-doc_read_avro = docfmt_partial(docstring=_docstring_read_avro)
+doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro)
 
 _docstring_read_parquet_metadata = """
 Read a Parquet file's metadata and schema
@@ -174,15 +175,23 @@
     columns are also loaded.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. Setting this argument to `False`
-    will require the entire file to be copied to host memory, and is highly
-    discouraged.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 open_file_options : dict, optional
     Dictionary of key-value pairs to pass to the function used to open remote
     files. By default, this will be `fsspec.parquet.open_parquet_file`. To
     deactivate optimized precaching, set the "method" to `None` under the
     "precache_options" key. Note that the `open_file_func` key can also be
     used to specify a custom file-open function.
+
+    .. deprecated:: 24.08
+        `open_file_options` is deprecated as it was intended for
+        pyarrow file inputs, which will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 bytes_per_thread : int, default None
     Determines the number of bytes to be allocated per thread to
     read the files in parallel. When there is a file of large size, we get
     slightly
@@ -468,8 +477,12 @@
     If True, use row index if available for faster seeking.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. This option is likely to improve
-    performance when making small reads from larger ORC files.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -934,7 +947,7 @@
 --------
 cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame.
 """
-doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf)
+doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf)
 
 _docstring_to_hdf = """
 Write the contained data to an HDF5 file using HDFStore.
@@ -1006,7 +1019,7 @@
 cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
 cudf.DataFrame.to_feather : Write out feather-format for DataFrames.
""" -doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf) +doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) _docstring_read_feather = """ Load an feather object from the file path, returning a DataFrame. @@ -1188,8 +1201,12 @@ the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger CSV files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -1409,7 +1426,7 @@ result : Series """ -doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) +doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) _docstring_get_reader_filepath_or_buffer = """ @@ -1430,9 +1447,19 @@ use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers. open_file_options : dict, optional Optional dictionary of keyword arguments to pass to `_open_remote_files` (used for remote storage only). + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem. If False, @@ -1708,7 +1735,8 @@ def get_reader_filepath_or_buffer( mode="rb", fs=None, iotypes=(BytesIO, NativeFile), - use_python_file_object=False, + # no_default aliases to False + use_python_file_object=no_default, open_file_options=None, allow_raw_text_input=False, storage_options=None, @@ -1720,6 +1748,30 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) + if use_python_file_object is no_default: + use_python_file_object = False + elif use_python_file_object is not None: + warnings.warn( + "The 'use_python_file_object' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + else: + # Preserve the readers (e.g. 
+        # Preserve the readers' (e.g. read_csv) default of True
+        # if no use_python_file_object option is specified by the user
+        # for now (note: this is different from the default for this
+        # function of False)
+        # TODO: when non-pyarrow file reading perf is good enough
+        # we can default this to False
+        use_python_file_object = True
+
+    if open_file_options is not None:
+        warnings.warn(
+            "The 'open_file_options' keyword is deprecated and "
+            "will be removed in a future version.",
+            FutureWarning,
+        )
+
     if isinstance(path_or_data, str):
         # Get a filesystem object if one isn't already available
         paths = [path_or_data]
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 7347ec7866a..c9b343e0f9f 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -6,6 +6,7 @@
 import os
 import traceback
 import warnings
+from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
@@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
     if result_mask is not None:
         result_col = result_col.set_mask(result_mask.as_mask())
     return result_col
+
+
+@contextmanager
+def maybe_filter_deprecation(
+    condition: bool, message: str, category: type[Warning]
+):
+    """Conditionally filter a warning category.
+
+    Parameters
+    ----------
+    condition
+        If true, filter the warning
+    message
+        Message to match, passed to :func:`warnings.filterwarnings`
+    category
+        Category of warning, passed to :func:`warnings.filterwarnings`
+    """
+    with warnings.catch_warnings():
+        if condition:
+            warnings.filterwarnings(
+                "ignore",
+                message,
+                category=category,
+            )
+        yield
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..3947c69aaa5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options):
         storage_options=s3so,
         open_file_options=open_file_options,
     )
-    assert df.a.sum().compute() == 10
-    assert df.b.sum().compute() == 9
+    with pytest.warns(FutureWarning):
+        assert df.a.sum().compute() == 10
+    with pytest.warns(FutureWarning):
+        assert df.b.sum().compute() == 9
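
The warning-filtering pattern above can be exercised on its own. Below is a
minimal standalone sketch (not part of the patch; the read_orc_like helper
and its string-based NativeFile check are illustrative placeholders for the
real reader logic):

import warnings
from contextlib import contextmanager


@contextmanager
def maybe_filter_deprecation(condition, message, category):
    # Same logic as the helper added to python/cudf/cudf/utils/utils.py:
    # suppress warnings matching `message` only while `condition` is true.
    with warnings.catch_warnings():
        if condition:
            warnings.filterwarnings("ignore", message, category=category)
        yield


def read_orc_like(sources):
    # Illustrative stand-in for cudf.read_orc: filter the internal
    # NativeFile deprecation warning only when the user did NOT pass a
    # NativeFile themselves, so user-facing deprecations still surface.
    have_nativefile = any(s == "nativefile" for s in sources)
    with maybe_filter_deprecation(
        not have_nativefile,
        message="Support for reading pyarrow's NativeFile is deprecated",
        category=FutureWarning,
    ):
        # Mimics the warning raised by NativeFileDatasource.__cinit__
        # when the reader constructs a NativeFile-backed datasource.
        warnings.warn(
            "Support for reading pyarrow's NativeFile is deprecated "
            "and will be removed in a future release of cudf.",
            FutureWarning,
        )


if __name__ == "__main__":
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        read_orc_like(["file.orc"])    # internal warning is suppressed
        read_orc_like(["nativefile"])  # warning reaches the user
    assert len(caught) == 1  # only the explicit NativeFile input warns

Filtering only when the caller did not themselves pass a NativeFile keeps the
user-facing deprecation visible while silencing the duplicate warning that
use_python_file_object=True would otherwise trigger internally.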