Skip to content

Commit 535db9b

Browse files
authored
Deprecate Arrow support in I/O (#16132)
Contributes to #15193. Authors: Thomas Li (https://github.com/lithomas1), Vyas Ramasubramani (https://github.com/vyasr). Approvers: Richard (Rick) Zamora (https://github.com/rjzamora), Lawrence Mitchell (https://github.com/wence-). URL: #16132
1 parent e169e8e commit 535db9b

File tree

12 files changed

+247
-132
lines changed

12 files changed

+247
-132
lines changed

python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile
77
from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
88
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
99

10+
import warnings
11+
1012

1113
cdef class Datasource:
1214
cdef datasource* get_datasource(self) except * nogil:
@@ -16,10 +18,16 @@ cdef class Datasource:
1618

1719
cdef class NativeFileDatasource(Datasource):
1820

19-
def __cinit__(self, NativeFile native_file,):
21+
def __cinit__(self, NativeFile native_file):
2022

2123
cdef shared_ptr[CRandomAccessFile] ra_src
2224

25+
warnings.warn(
26+
"Support for reading pyarrow's NativeFile is deprecated "
27+
"and will be removed in a future release of cudf.",
28+
FutureWarning,
29+
)
30+
2331
ra_src = native_file.get_random_access_file()
2432
self.c_datasource.reset(new arrow_io_source(ra_src))
2533

python/cudf/cudf/io/csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def read_csv(
5050
comment=None,
5151
delim_whitespace=False,
5252
byte_range=None,
53-
use_python_file_object=True,
53+
use_python_file_object=None,
5454
storage_options=None,
5555
bytes_per_thread=None,
5656
):

python/cudf/cudf/io/orc.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from cudf._lib import orc as liborc
1111
from cudf.api.types import is_list_like
1212
from cudf.utils import ioutils
13+
from cudf.utils.utils import maybe_filter_deprecation
1314

1415

1516
def _make_empty_df(filepath_or_buffer, columns):
@@ -280,7 +281,7 @@ def read_orc(
280281
num_rows=None,
281282
use_index=True,
282283
timestamp_type=None,
283-
use_python_file_object=True,
284+
use_python_file_object=None,
284285
storage_options=None,
285286
bytes_per_thread=None,
286287
):
@@ -320,6 +321,9 @@ def read_orc(
320321
)
321322

322323
filepaths_or_buffers = []
324+
have_nativefile = any(
325+
isinstance(source, pa.NativeFile) for source in filepath_or_buffer
326+
)
323327
for source in filepath_or_buffer:
324328
if ioutils.is_directory(
325329
path_or_data=source, storage_options=storage_options
@@ -360,17 +364,24 @@ def read_orc(
360364
stripes = selected_stripes
361365

362366
if engine == "cudf":
363-
return DataFrame._from_data(
364-
*liborc.read_orc(
365-
filepaths_or_buffers,
366-
columns,
367-
stripes,
368-
skiprows,
369-
num_rows,
370-
use_index,
371-
timestamp_type,
367+
# Don't want to warn if use_python_file_object causes us to get
368+
# a NativeFile (there is a separate deprecation warning for that)
369+
with maybe_filter_deprecation(
370+
not have_nativefile,
371+
message="Support for reading pyarrow's NativeFile is deprecated",
372+
category=FutureWarning,
373+
):
374+
return DataFrame._from_data(
375+
*liborc.read_orc(
376+
filepaths_or_buffers,
377+
columns,
378+
stripes,
379+
skiprows,
380+
num_rows,
381+
use_index,
382+
timestamp_type,
383+
)
372384
)
373-
)
374385
else:
375386
from pyarrow import orc
376387

python/cudf/cudf/io/parquet.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import numpy as np
1717
import pandas as pd
18+
import pyarrow as pa
1819
from pyarrow import dataset as ds
1920

2021
import cudf
@@ -23,6 +24,7 @@
2324
from cudf.core.column import as_column, build_categorical_column, column_empty
2425
from cudf.utils import ioutils
2526
from cudf.utils.performance_tracking import _performance_tracking
27+
from cudf.utils.utils import maybe_filter_deprecation
2628

2729
BYTE_SIZES = {
2830
"kb": 1000,
@@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer):
350352
path_or_data=source,
351353
compression=None,
352354
fs=fs,
353-
use_python_file_object=True,
355+
use_python_file_object=None,
354356
open_file_options=None,
355357
storage_options=None,
356358
bytes_per_thread=None,
@@ -532,7 +534,7 @@ def read_parquet(
532534
filters=None,
533535
row_groups=None,
534536
use_pandas_metadata=True,
535-
use_python_file_object=True,
537+
use_python_file_object=None,
536538
categorical_partitions=True,
537539
open_file_options=None,
538540
bytes_per_thread=None,
@@ -615,6 +617,9 @@ def read_parquet(
615617
row_groups=row_groups,
616618
fs=fs,
617619
)
620+
have_nativefile = any(
621+
isinstance(source, pa.NativeFile) for source in filepath_or_buffer
622+
)
618623
for source in filepath_or_buffer:
619624
tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
620625
path_or_data=source,
@@ -662,19 +667,26 @@ def read_parquet(
662667
)
663668

664669
# Convert parquet data to a cudf.DataFrame
665-
df = _parquet_to_frame(
666-
filepaths_or_buffers,
667-
engine,
668-
*args,
669-
columns=columns,
670-
row_groups=row_groups,
671-
use_pandas_metadata=use_pandas_metadata,
672-
partition_keys=partition_keys,
673-
partition_categories=partition_categories,
674-
dataset_kwargs=dataset_kwargs,
675-
**kwargs,
676-
)
677670

671+
# Don't want to warn if use_python_file_object causes us to get
672+
# a NativeFile (there is a separate deprecation warning for that)
673+
with maybe_filter_deprecation(
674+
not have_nativefile,
675+
message="Support for reading pyarrow's NativeFile is deprecated",
676+
category=FutureWarning,
677+
):
678+
df = _parquet_to_frame(
679+
filepaths_or_buffers,
680+
engine,
681+
*args,
682+
columns=columns,
683+
row_groups=row_groups,
684+
use_pandas_metadata=use_pandas_metadata,
685+
partition_keys=partition_keys,
686+
partition_categories=partition_categories,
687+
dataset_kwargs=dataset_kwargs,
688+
**kwargs,
689+
)
678690
# Apply filters row-wise (if any are defined), and return
679691
df = _apply_post_filters(df, filters)
680692
if projected_columns:

python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,9 @@
22

33
import io
44

5-
import pyarrow as pa
65
import pytest
76

87
import cudf._lib.pylibcudf as plc
9-
from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
108

119

1210
@pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
@@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink):
1816
"""
1917
Skip invalid sinks for SinkInfo
2018
"""
21-
if io_class is plc.io.SinkInfo and isinstance(
22-
sink, (bytes, NativeFileDatasource)
23-
):
24-
pytest.skip(f"{sink} is not a valid input for SinkInfo")
19+
if io_class is plc.io.SinkInfo and isinstance(sink, bytes):
20+
pytest.skip("bytes is not a valid input for SinkInfo")
2521

2622

2723
@pytest.mark.parametrize(
@@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink):
3026
"a.txt",
3127
b"hello world",
3228
io.BytesIO(b"hello world"),
33-
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
3429
],
3530
)
3631
def test_source_info_ctor(io_class, source, tmp_path):
@@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path):
4742
@pytest.mark.parametrize(
4843
"sources",
4944
[
45+
["a.txt"],
46+
[b"hello world"],
47+
[io.BytesIO(b"hello world")],
5048
["a.txt", "a.txt"],
5149
[b"hello world", b"hello there"],
5250
[io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
53-
[
54-
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
55-
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
56-
],
5751
],
5852
)
5953
def test_source_info_ctor_multiple(io_class, sources, tmp_path):
@@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path):
7973
io.BytesIO(b"hello there"),
8074
b"hello world",
8175
],
82-
[
83-
NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
84-
"awef.txt",
85-
b"hello world",
86-
],
8776
],
8877
)
8978
def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):

python/cudf/cudf/tests/test_csv.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf):
10851085
# Arrow FileSystem interface
10861086
expect = cudf.read_csv(path_or_buf("filepath"))
10871087
fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath"))
1088-
with fs.open_input_file(path) as fil:
1089-
got = cudf.read_csv(fil)
1088+
with pytest.warns(FutureWarning):
1089+
with fs.open_input_file(path) as fil:
1090+
got = cudf.read_csv(fil)
10901091

10911092
assert_eq(expect, got)
10921093

python/cudf/cudf/tests/test_gcs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ def mock_size(*args):
4646
# use_python_file_object=True, because the pyarrow
4747
# `open_input_file` command will fail (since it doesn't
4848
# use the monkey-patched `open` definition)
49-
got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
49+
with pytest.warns(FutureWarning):
50+
got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
5051
assert_eq(pdf, got)
5152

5253
# AbstractBufferedFile -> PythonFile conversion

python/cudf/cudf/tests/test_parquet.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf):
711711
expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
712712
fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath"))
713713
with fs.open_input_file(path) as fil:
714-
got = cudf.read_parquet(fil)
714+
with pytest.warns(FutureWarning):
715+
got = cudf.read_parquet(fil)
715716

716717
assert_eq(expect, got)
717718

@@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object(
726727
fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath"))
727728

728729
# Pass open fsspec file
729-
with fs.open(paths[0], mode="rb") as fil:
730-
got1 = cudf.read_parquet(
731-
fil, use_python_file_object=use_python_file_object
732-
)
730+
with pytest.warns(FutureWarning):
731+
with fs.open(paths[0], mode="rb") as fil:
732+
got1 = cudf.read_parquet(
733+
fil, use_python_file_object=use_python_file_object
734+
)
733735
assert_eq(expect, got1)
734736

735737
# Pass path only
736-
got2 = cudf.read_parquet(
737-
paths[0], use_python_file_object=use_python_file_object
738-
)
738+
with pytest.warns(FutureWarning):
739+
got2 = cudf.read_parquet(
740+
paths[0], use_python_file_object=use_python_file_object
741+
)
739742
assert_eq(expect, got2)
740743

741744

0 commit comments

Comments (0)