diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 83c7f9c8635..70e293eb0e2 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -34,6 +34,7 @@ from cudf._lib.cpp.io.types cimport ( ) from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id +from cudf._lib.io.datasource cimport NativeFileDatasource from cudf._lib.io.utils cimport ( make_sink_info, make_source_info, @@ -53,6 +54,8 @@ from cudf._lib.utils cimport ( table_view_from_table, ) +from pyarrow.lib import NativeFile + from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.api.types import is_list_dtype, is_struct_dtype @@ -66,6 +69,10 @@ cpdef read_raw_orc_statistics(filepath_or_buffer): cudf.io.orc.read_orc_statistics """ + # Handle NativeFile input + if isinstance(filepath_or_buffer, NativeFile): + filepath_or_buffer = NativeFileDatasource(filepath_or_buffer) + cdef raw_orc_statistics raw = ( libcudf_read_raw_orc_statistics(make_source_info([filepath_or_buffer])) ) @@ -209,6 +216,9 @@ cdef orc_reader_options make_orc_reader_options( object decimal_cols_as_float ) except*: + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + filepaths_or_buffers[i] = NativeFileDatasource(datasource) cdef vector[string] c_column_names cdef vector[vector[size_type]] strps = stripes c_column_names.reserve(len(column_names)) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 3aa672223c9..e1e38820d47 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -287,6 +287,7 @@ def read_orc( use_index=True, decimal_cols_as_float=None, timestamp_type=None, + use_python_file_object=True, **kwargs, ): """{docstring}""" @@ -321,7 +322,10 @@ def read_orc( source = fs.sep.join([source, "*.orc"]) tmp_source, compression = ioutils.get_filepath_or_buffer( - path_or_data=source, compression=None, **kwargs, + path_or_data=source, + compression=None, + use_python_file_object=use_python_file_object, + **kwargs, ) if compression is not None: raise ValueError( diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index added0887f1..2d7907a43b8 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -326,7 +326,9 @@ def test_read_json(s3_base, s3so): assert_eq(expect, got) -def test_read_orc(s3_base, s3so, datadir): +@pytest.mark.parametrize("use_python_file_object", [False, True]) +@pytest.mark.parametrize("columns", [None, ["string1"]]) +def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bname = "orc" @@ -337,9 +339,36 @@ def test_read_orc(s3_base, s3so, datadir): with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_orc( - "s3://{}/{}".format(bname, fname), storage_options=s3so + "s3://{}/{}".format(bname, fname), + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, ) + if columns: + expect = expect[columns] + assert_eq(expect, got) + + +@pytest.mark.parametrize("columns", [None, ["string1"]]) +def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): + source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") + fname = "test_orc_reader.orc" + bname = "orc" + expect = pa.orc.ORCFile(source_file).read().to_pandas() + + with open(source_file, "rb") as f: + buffer = f.read() + + with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file("{}/{}".format(bname, fname)) as fil: + got = cudf.read_orc(fil, columns=columns) + + if columns: + expect = expect[columns] assert_eq(expect, got) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 02fcead42d6..628363ae6d3 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -357,6 +357,10 @@ decimal_cols_as_float: list, default None If specified, names of the columns that should be converted from Decimal to Float64 in the resulting dataframe. +use_python_file_object : boolean, default True + If True, Arrow-backed PythonFile objects will be used in place of fsspec + AbstractBufferedFile objects at IO time. This option is likely to improve + performance when making small reads from larger ORC files. kwargs are passed to the engine Returns