Skip to content

Commit e5753e3

Browse files
authored
Add Avro Reader options classes to pylibcudf (#17599)
Part of #17565. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: #17599
1 parent 0058b52 commit e5753e3

File tree

5 files changed

+173
-59
lines changed

5 files changed

+173
-59
lines changed

python/cudf/cudf/io/avro.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,18 @@ def read_avro(
3333
if not isinstance(skip_rows, int) or skip_rows < 0:
3434
raise TypeError("skip_rows must be an int >= 0")
3535

36-
plc_result = plc.io.avro.read_avro(
37-
plc.io.types.SourceInfo([filepath_or_buffer]),
38-
columns,
39-
skip_rows,
40-
num_rows,
36+
options = (
37+
plc.io.avro.AvroReaderOptions.builder(
38+
plc.io.types.SourceInfo([filepath_or_buffer])
39+
)
40+
.skip_rows(skip_rows)
41+
.num_rows(num_rows)
42+
.build()
4143
)
4244

45+
if columns is not None and len(columns) > 0:
46+
options.set_columns(columns)
47+
48+
plc_result = plc.io.avro.read_avro(options)
49+
4350
return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result))
Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,23 @@
11
# Copyright (c) 2024, NVIDIA CORPORATION.
22
from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
3-
from pylibcudf.libcudf.io.avro cimport avro_reader_options
3+
from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder
44
from pylibcudf.libcudf.types cimport size_type
55

66

7-
cpdef TableWithMetadata read_avro(
8-
SourceInfo source_info,
9-
list columns = *,
10-
size_type skip_rows = *,
11-
size_type num_rows = *
12-
)
7+
from pylibcudf.libcudf.types cimport size_type
8+
9+
cdef class AvroReaderOptions:
10+
cdef avro_reader_options c_obj
11+
cdef SourceInfo source
12+
cpdef void set_columns(self, list col_names)
13+
14+
15+
cdef class AvroReaderOptionsBuilder:
16+
cdef avro_reader_options_builder c_obj
17+
cdef SourceInfo source
18+
cpdef AvroReaderOptionsBuilder columns(self, list col_names)
19+
cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows)
20+
cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows)
21+
cpdef AvroReaderOptions build(self)
22+
23+
cpdef TableWithMetadata read_avro(AvroReaderOptions options)
Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,16 @@
11
# Copyright (c) 2024, NVIDIA CORPORATION.
22
from pylibcudf.io.types import SourceInfo, TableWithMetadata
33

4-
__all__ = ["read_avro"]
5-
6-
def read_avro(
7-
source_info: SourceInfo,
8-
columns: list[str] | None = None,
9-
skip_rows: int = 0,
10-
num_rows: int = -1,
11-
) -> TableWithMetadata: ...
4+
__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]
5+
6+
class AvroReaderOptions:
7+
@staticmethod
8+
def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...
9+
10+
class AvroReaderOptionsBuilder:
11+
def columns(col_names: list[str]) -> AvroReaderOptionsBuilder: ...
12+
def skip_rows(skip_rows: int) -> AvroReaderOptionsBuilder: ...
13+
def num_rows(num_rows: int) -> AvroReaderOptionsBuilder: ...
14+
def build(self) -> AvroReaderOptions: ...
15+
16+
def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ...

python/pylibcudf/pylibcudf/io/avro.pyx

Lines changed: 121 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport (
1010
)
1111
from pylibcudf.libcudf.types cimport size_type
1212

13-
__all__ = ["read_avro"]
13+
__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"]
14+
15+
16+
cdef class AvroReaderOptions:
17+
"""
18+
The settings to use for ``read_avro``
19+
For details, see :cpp:class:`cudf::io::avro_reader_options`
20+
"""
21+
@staticmethod
22+
def builder(SourceInfo source):
23+
"""
24+
Create an AvroReaderOptionsBuilder object
25+
26+
For details, see :cpp:func:`cudf::io::avro_reader_options::builder`
27+
28+
Parameters
29+
----------
30+
source : SourceInfo
31+
The source to read the Avro file from.
32+
33+
Returns
34+
-------
35+
AvroReaderOptionsBuilder
36+
Builder to build AvroReaderOptions
37+
"""
38+
cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__(
39+
AvroReaderOptionsBuilder
40+
)
41+
avro_builder.c_obj = avro_reader_options.builder(source.c_obj)
42+
avro_builder.source = source
43+
return avro_builder
44+
45+
cpdef void set_columns(self, list col_names):
46+
"""
47+
Set names of the columns to be read.
48+
49+
Parameters
50+
----------
51+
col_names : list[str]
52+
List of column names
53+
54+
Returns
55+
-------
56+
None
57+
"""
58+
cdef vector[string] vec
59+
vec.reserve(len(col_names))
60+
for name in col_names:
61+
vec.push_back(str(name).encode())
62+
self.c_obj.set_columns(vec)
63+
64+
65+
cdef class AvroReaderOptionsBuilder:
66+
cpdef AvroReaderOptionsBuilder columns(self, list col_names):
67+
"""
68+
Set names of the columns to be read.
69+
70+
Parameters
71+
----------
72+
col_names : list
73+
List of column names
74+
75+
Returns
76+
-------
77+
AvroReaderOptionsBuilder
78+
"""
79+
cdef vector[string] vec
80+
vec.reserve(len(col_names))
81+
for name in col_names:
82+
vec.push_back(str(name).encode())
83+
self.c_obj.columns(vec)
84+
return self
85+
86+
cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows):
87+
"""
88+
Sets number of rows to skip.
89+
90+
Parameters
91+
----------
92+
skip_rows : size_type
93+
Number of rows to skip from start
94+
95+
Returns
96+
-------
97+
AvroReaderOptionsBuilder
98+
"""
99+
self.c_obj.skip_rows(skip_rows)
100+
return self
101+
102+
cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows):
103+
"""
104+
Sets number of rows to read.
105+
106+
Parameters
107+
----------
108+
num_rows : size_type
109+
Number of rows to read after skip
110+
111+
Returns
112+
-------
113+
AvroReaderOptionsBuilder
114+
"""
115+
self.c_obj.num_rows(num_rows)
116+
return self
117+
118+
cpdef AvroReaderOptions build(self):
119+
"""Create an AvroReaderOptions object"""
120+
cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__(
121+
AvroReaderOptions
122+
)
123+
avro_options.c_obj = move(self.c_obj.build())
124+
avro_options.source = self.source
125+
return avro_options
14126

15127

16128
cpdef TableWithMetadata read_avro(
17-
SourceInfo source_info,
18-
list columns = None,
19-
size_type skip_rows = 0,
20-
size_type num_rows = -1
129+
AvroReaderOptions options
21130
):
22131
"""
23-
Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
132+
Read from Avro format.
133+
134+
The source to read from and options are encapsulated
135+
by the `options` object.
24136
25137
For details, see :cpp:func:`read_avro`.
26138
27139
Parameters
28140
----------
29-
source_info: SourceInfo
30-
The SourceInfo object to read the avro dataset from.
31-
columns: list, default None
32-
Optional columns to read, if not provided, reads all columns in the file.
33-
skip_rows: size_type, default 0
34-
The number of rows to skip.
35-
num_rows: size_type, default -1
36-
The number of rows to read, after skipping rows.
37-
If -1 is passed, all rows will be read.
38-
39-
Returns
40-
-------
41-
TableWithMetadata
42-
The Table and its corresponding metadata (column names) that were read in.
141+
options: AvroReaderOptions
142+
Settings for controlling reading behavior
43143
"""
44-
cdef vector[string] c_columns
45-
if columns is not None and len(columns) > 0:
46-
c_columns.reserve(len(columns))
47-
for col in columns:
48-
c_columns.push_back(str(col).encode())
49-
50-
cdef avro_reader_options avro_opts = (
51-
avro_reader_options.builder(source_info.c_obj)
52-
.columns(c_columns)
53-
.skip_rows(skip_rows)
54-
.num_rows(num_rows)
55-
.build()
56-
)
57-
58144
with nogil:
59-
c_result = move(cpp_read_avro(avro_opts))
145+
c_result = move(cpp_read_avro(options.c_obj))
60146

61147
return TableWithMetadata.from_libcudf(c_result)

python/pylibcudf/pylibcudf/tests/io/test_avro.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
9898
buffer.seek(0)
9999

100100
res = plc.io.avro.read_avro(
101-
plc.io.types.SourceInfo([buffer]),
102-
columns=columns,
103-
skip_rows=skip_rows,
104-
num_rows=num_rows,
101+
(
102+
plc.io.avro.AvroReaderOptions.builder(
103+
plc.io.types.SourceInfo([buffer])
104+
)
105+
.columns(columns)
106+
.skip_rows(skip_rows)
107+
.num_rows(num_rows)
108+
.build()
109+
)
105110
)
106111

107112
expected = pa.Table.from_arrays(

0 commit comments

Comments
 (0)