Skip to content

Commit 469f226

Browse files
authored
Remove unused functionality in cudf._lib.utils.pyx (#17586)
Contributes to #17317. More can be removed once my other cudf._lib PRs are in. Authors: Matthew Roeschke (https://github.com/mroeschke). Approvers: Bradley Dice (https://github.com/bdice). URL: #17586
1 parent e9744b4 commit 469f226

File tree

4 files changed

+38
-337
lines changed

4 files changed

+38
-337
lines changed

python/cudf/cudf/_lib/utils.pxd

-16
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,6 @@
11
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
22

3-
from libcpp.memory cimport unique_ptr
4-
from libcpp.string cimport string
5-
from libcpp.vector cimport vector
6-
7-
from pylibcudf.libcudf.column.column cimport column_view
8-
from pylibcudf.libcudf.table.table cimport table, table_view
9-
10-
11-
cdef data_from_unique_ptr(
12-
unique_ptr[table] c_tbl, column_names, index_names=*)
133
cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
144
cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
15-
cdef data_from_table_view(
16-
table_view tv, object owner, object column_names, object index_names=*)
17-
cdef table_view table_view_from_columns(columns) except *
18-
cdef table_view table_view_from_table(tbl, ignore_index=*) except*
19-
cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
20-
cdef columns_from_table_view(table_view tv, object owners)
215
cpdef columns_from_pylibcudf_table(tbl)
226
cpdef _data_from_columns(columns, column_names, index_names=*)

python/cudf/cudf/_lib/utils.pyx

+1-308
Original file line numberDiff line numberDiff line change
@@ -1,233 +1,7 @@
11
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
2-
3-
import numpy as np
4-
import pyarrow as pa
5-
62
import cudf
73

8-
from cython.operator cimport dereference
9-
from libcpp.memory cimport unique_ptr
10-
from libcpp.utility cimport move
11-
from libcpp.vector cimport vector
12-
13-
from pylibcudf.libcudf.column.column cimport column, column_view
14-
from pylibcudf.libcudf.table.table cimport table
15-
from pylibcudf.libcudf.table.table_view cimport table_view
16-
from pylibcudf.libcudf.types cimport size_type
17-
184
from cudf._lib.column cimport Column
19-
from pylibcudf cimport Column as plc_Column
20-
try:
21-
import ujson as json
22-
except ImportError:
23-
import json
24-
25-
from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
26-
27-
PARQUET_META_TYPE_MAP = {
28-
str(cudf_dtype): str(pandas_dtype)
29-
for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
30-
}
31-
32-
cdef table_view table_view_from_columns(columns) except*:
33-
"""Create a cudf::table_view from an iterable of Columns."""
34-
cdef vector[column_view] column_views
35-
36-
cdef Column col
37-
for col in columns:
38-
column_views.push_back(col.view())
39-
40-
return table_view(column_views)
41-
42-
43-
cdef table_view table_view_from_table(tbl, ignore_index=False) except*:
44-
"""Create a cudf::table_view from a Table.
45-
46-
Parameters
47-
----------
48-
ignore_index : bool, default False
49-
If True, don't include the index in the columns.
50-
"""
51-
return table_view_from_columns(
52-
tbl._index._columns + tbl._columns
53-
if not ignore_index and tbl._index is not None
54-
else tbl._columns
55-
)
56-
57-
58-
cpdef generate_pandas_metadata(table, index):
59-
col_names = []
60-
types = []
61-
index_levels = []
62-
index_descriptors = []
63-
columns_to_convert = list(table._columns)
64-
# Columns
65-
for name, col in table._column_labels_and_values:
66-
if cudf.get_option("mode.pandas_compatible"):
67-
# in pandas-compat mode, non-string column names are stringified.
68-
col_names.append(str(name))
69-
else:
70-
col_names.append(name)
71-
72-
if isinstance(col.dtype, cudf.CategoricalDtype):
73-
raise ValueError(
74-
"'category' column dtypes are currently not "
75-
+ "supported by the gpu accelerated parquet writer"
76-
)
77-
elif isinstance(col.dtype, (
78-
cudf.ListDtype,
79-
cudf.StructDtype,
80-
cudf.core.dtypes.DecimalDtype
81-
)):
82-
types.append(col.dtype.to_arrow())
83-
else:
84-
# A boolean element takes 8 bits in cudf and 1 bit in
85-
# pyarrow. To make sure the cudf format is interpretable
86-
# in arrow, we use `int8` type when converting from a
87-
# cudf boolean array.
88-
if col.dtype.type == np.bool_:
89-
types.append(pa.int8())
90-
else:
91-
types.append(np_to_pa_dtype(col.dtype))
92-
93-
# Indexes
94-
materialize_index = False
95-
if index is not False:
96-
for level, name in enumerate(table._index.names):
97-
if isinstance(table._index, cudf.MultiIndex):
98-
idx = table.index.get_level_values(level)
99-
else:
100-
idx = table.index
101-
102-
if isinstance(idx, cudf.RangeIndex):
103-
if index is None:
104-
descr = {
105-
"kind": "range",
106-
"name": table.index.name,
107-
"start": table.index.start,
108-
"stop": table.index.stop,
109-
"step": table.index.step,
110-
}
111-
else:
112-
materialize_index = True
113-
# When `index=True`, RangeIndex needs to be materialized.
114-
materialized_idx = idx._as_int_index()
115-
descr = _index_level_name(
116-
index_name=materialized_idx.name,
117-
level=level,
118-
column_names=col_names
119-
)
120-
index_levels.append(materialized_idx)
121-
columns_to_convert.append(materialized_idx._values)
122-
col_names.append(descr)
123-
types.append(np_to_pa_dtype(materialized_idx.dtype))
124-
else:
125-
descr = _index_level_name(
126-
index_name=idx.name,
127-
level=level,
128-
column_names=col_names
129-
)
130-
columns_to_convert.append(idx._values)
131-
col_names.append(descr)
132-
if isinstance(idx.dtype, cudf.CategoricalDtype):
133-
raise ValueError(
134-
"'category' column dtypes are currently not "
135-
+ "supported by the gpu accelerated parquet writer"
136-
)
137-
elif isinstance(idx.dtype, cudf.ListDtype):
138-
types.append(col.dtype.to_arrow())
139-
else:
140-
# A boolean element takes 8 bits in cudf and 1 bit in
141-
# pyarrow. To make sure the cudf format is interpretable
142-
# in arrow, we use `int8` type when converting from a
143-
# cudf boolean array.
144-
if idx.dtype.type == np.bool_:
145-
types.append(pa.int8())
146-
else:
147-
types.append(np_to_pa_dtype(idx.dtype))
148-
149-
index_levels.append(idx)
150-
index_descriptors.append(descr)
151-
152-
df_meta = table.head(0)
153-
if materialize_index:
154-
df_meta.index = df_meta.index._as_int_index()
155-
metadata = pa.pandas_compat.construct_metadata(
156-
columns_to_convert=columns_to_convert,
157-
# It is OKAY to do `.head(0).to_pandas()` because
158-
# this method will extract `.columns` metadata only
159-
df=df_meta.to_pandas(),
160-
column_names=col_names,
161-
index_levels=index_levels,
162-
index_descriptors=index_descriptors,
163-
preserve_index=index,
164-
types=types,
165-
)
166-
167-
md_dict = json.loads(metadata[b"pandas"])
168-
169-
# correct metadata for list and struct and nullable numeric types
170-
for col_meta in md_dict["columns"]:
171-
if (
172-
col_meta["name"] in table._column_names
173-
and table._data[col_meta["name"]].nullable
174-
and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
175-
and col_meta["pandas_type"] != "decimal"
176-
):
177-
col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
178-
col_meta["numpy_type"]
179-
]
180-
if col_meta["numpy_type"] in ("list", "struct"):
181-
col_meta["numpy_type"] = "object"
182-
183-
return json.dumps(md_dict)
184-
185-
186-
def _index_level_name(index_name, level, column_names):
187-
"""
188-
Return the name of an index level or a default name
189-
if `index_name` is None or is already a column name.
190-
191-
Parameters
192-
----------
193-
index_name : name of an Index object
194-
level : level of the Index object
195-
196-
Returns
197-
-------
198-
name : str
199-
"""
200-
if index_name is not None and index_name not in column_names:
201-
return index_name
202-
else:
203-
return f"__index_level_{level}__"
204-
205-
206-
cdef columns_from_unique_ptr(
207-
unique_ptr[table] c_tbl
208-
):
209-
"""Convert a libcudf table into list of columns.
210-
211-
Parameters
212-
----------
213-
c_tbl : unique_ptr[cudf::table]
214-
The libcudf table whose columns will be extracted
215-
216-
Returns
217-
-------
218-
list[Column]
219-
A list of columns.
220-
"""
221-
cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release())
222-
cdef vector[unique_ptr[column]].iterator it = c_columns.begin()
223-
224-
cdef size_t i
225-
226-
return [
227-
Column.from_pylibcudf(
228-
plc_Column.from_libcudf(move(dereference(it+i)))
229-
) for i in range(c_columns.size())
230-
]
2315

2326

2337
cpdef columns_from_pylibcudf_table(tbl):
@@ -281,8 +55,7 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
28155
# the data while actually constructing the Index object here (instead
28256
# of just returning a dict for that as well). As we clean up the
28357
# Frame factories we may want to look for a less dissonant approach
284-
# that does not impose performance penalties. The same applies to
285-
# data_from_table_view below.
58+
# that does not impose performance penalties.
28659
cudf.core.index._index_from_data(
28760
{
28861
name: columns[i]
@@ -300,16 +73,6 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
30073
return data, index
30174

30275

303-
cdef data_from_unique_ptr(
304-
unique_ptr[table] c_tbl, column_names, index_names=None
305-
):
306-
return _data_from_columns(
307-
columns_from_unique_ptr(move(c_tbl)),
308-
column_names,
309-
index_names
310-
)
311-
312-
31376
cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
31477
return _data_from_columns(
31578
columns_from_pylibcudf_table(tbl),
@@ -329,73 +92,3 @@ cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None)
32992
column_names=column_names,
33093
index_names=index_names
33194
)
332-
333-
cdef columns_from_table_view(
334-
table_view tv,
335-
object owners,
336-
):
337-
"""
338-
Given a ``cudf::table_view``, constructs a list of columns from it,
339-
along with referencing an owner Python object that owns the memory
340-
lifetime. ``owners`` must be either None or a list of columns. If ``owners``
341-
is a list of columns, the owner of the `i`th ``cudf::column_view``
342-
in the table view is ``owners[i]``. For more about memory ownership,
343-
see ``Column.from_column_view``.
344-
"""
345-
346-
return [
347-
Column.from_column_view(
348-
tv.column(i), owners[i] if isinstance(owners, list) else None
349-
) for i in range(tv.num_columns())
350-
]
351-
352-
cdef data_from_table_view(
353-
table_view tv,
354-
object owner,
355-
object column_names,
356-
object index_names=None
357-
):
358-
"""
359-
Given a ``cudf::table_view``, constructs a Frame from it,
360-
along with referencing an ``owner`` Python object that owns the memory
361-
lifetime. If ``owner`` is a Frame we reach inside of it and
362-
reach inside of each ``cudf.Column`` to make the owner of each newly
363-
created ``Buffer`` underneath the ``cudf.Column`` objects of the
364-
created Frame the respective ``Buffer`` from the relevant
365-
``cudf.Column`` of the ``owner`` Frame.
366-
"""
367-
cdef size_type column_idx = 0
368-
table_owner = isinstance(owner, cudf.core.frame.Frame)
369-
370-
# First construct the index, if any
371-
index = None
372-
if index_names is not None:
373-
index_columns = []
374-
for _ in index_names:
375-
column_owner = owner
376-
if table_owner:
377-
column_owner = owner._index._columns[column_idx]
378-
index_columns.append(
379-
Column.from_column_view(
380-
tv.column(column_idx),
381-
column_owner
382-
)
383-
)
384-
column_idx += 1
385-
index = cudf.core.index._index_from_data(
386-
dict(zip(index_names, index_columns)))
387-
388-
# Construct the data dict
389-
cdef size_type source_column_idx = 0
390-
data_columns = []
391-
for _ in column_names:
392-
column_owner = owner
393-
if table_owner:
394-
column_owner = owner._columns[source_column_idx]
395-
data_columns.append(
396-
Column.from_column_view(tv.column(column_idx), column_owner)
397-
)
398-
column_idx += 1
399-
source_column_idx += 1
400-
401-
return dict(zip(column_names, data_columns)), index

0 commit comments

Comments
 (0)