1
1
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
2
-
3
- import numpy as np
4
- import pyarrow as pa
5
-
6
2
import cudf
7
3
8
- from cython.operator cimport dereference
9
- from libcpp.memory cimport unique_ptr
10
- from libcpp.utility cimport move
11
- from libcpp.vector cimport vector
12
-
13
- from pylibcudf.libcudf.column.column cimport column, column_view
14
- from pylibcudf.libcudf.table.table cimport table
15
- from pylibcudf.libcudf.table.table_view cimport table_view
16
- from pylibcudf.libcudf.types cimport size_type
17
-
18
4
from cudf._lib.column cimport Column
19
- from pylibcudf cimport Column as plc_Column
20
- try :
21
- import ujson as json
22
- except ImportError :
23
- import json
24
-
25
- from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
26
-
27
# String-keyed view of ``np_dtypes_to_pandas_dtypes``: both the key dtype and
# the value dtype are converted with ``str`` so the table can be consulted
# using the dtype names that appear in JSON metadata.
PARQUET_META_TYPE_MAP = {
    str(source_dtype): str(target_dtype)
    for source_dtype, target_dtype in np_dtypes_to_pandas_dtypes.items()
}
31
-
32
cdef table_view table_view_from_columns(columns) except *:
    """Build a ``cudf::table_view`` from an iterable of Columns.

    Each element of ``columns`` contributes the result of its ``view()``
    method, in iteration order.
    """
    cdef vector[column_view] views
    cdef Column current

    for current in columns:
        views.push_back(current.view())

    return table_view(views)
41
-
42
-
43
cdef table_view table_view_from_table(tbl, ignore_index=False) except *:
    """Build a ``cudf::table_view`` over the columns of a Table.

    Parameters
    ----------
    tbl
        Object exposing ``_columns`` and ``_index`` (whose own columns are
        read from ``_index._columns``).
    ignore_index : bool, default False
        If True, don't include the index in the columns.
    """
    # Data columns only: either requested explicitly or there is no index.
    if ignore_index or tbl._index is None:
        return table_view_from_columns(tbl._columns)

    # Index columns come first, followed by the data columns.
    return table_view_from_columns(tbl._index._columns + tbl._columns)
56
-
57
-
58
def _arrow_type_for_pandas_metadata(dtype):
    """Return the pyarrow type recorded in the metadata for ``dtype``.

    Shared by the column and index paths of ``generate_pandas_metadata`` so
    both apply the same rules.

    Raises
    ------
    ValueError
        If ``dtype`` is a ``cudf.CategoricalDtype``.
    """
    if isinstance(dtype, cudf.CategoricalDtype):
        raise ValueError(
            "'category' column dtypes are currently not "
            "supported by the gpu accelerated parquet writer"
        )
    if isinstance(
        dtype,
        (cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype),
    ):
        return dtype.to_arrow()
    # A boolean element takes 8 bits in cudf and 1 bit in
    # pyarrow. To make sure the cudf format is interpretable
    # in arrow, we use `int8` type when converting from a
    # cudf boolean array.
    if dtype.type == np.bool_:
        return pa.int8()
    return np_to_pa_dtype(dtype)


cpdef generate_pandas_metadata(table, index):
    """Build the JSON-encoded pandas metadata for ``table``.

    Parameters
    ----------
    table
        Frame-like object; its ``_columns``, ``_column_labels_and_values``,
        ``_column_names``, ``_data``, ``_index``/``index`` and ``head`` are
        used. Presumably a ``cudf.DataFrame`` -- confirm against callers.
    index : bool or None
        ``False`` leaves the index out entirely. ``None`` records a
        ``RangeIndex`` as a ``"range"`` descriptor without materializing
        it. Any other value materializes a ``RangeIndex`` as a column. The
        value is also forwarded as ``preserve_index``.

    Returns
    -------
    str
        JSON text of the ``b"pandas"`` metadata entry after the
        ``numpy_type`` corrections applied below.

    Raises
    ------
    ValueError
        If a column or a non-range index level has a categorical dtype.
    """
    col_names = []
    types = []
    index_levels = []
    index_descriptors = []
    columns_to_convert = list(table._columns)

    # Columns
    # In pandas-compat mode, non-string column names are stringified. The
    # option is read once because it does not depend on the column.
    stringify_names = cudf.get_option("mode.pandas_compatible")
    for name, col in table._column_labels_and_values:
        col_names.append(str(name) if stringify_names else name)
        types.append(_arrow_type_for_pandas_metadata(col.dtype))

    # Indexes
    materialize_index = False
    if index is not False:
        is_multi_index = isinstance(table._index, cudf.MultiIndex)
        for level, _ in enumerate(table._index.names):
            if is_multi_index:
                idx = table.index.get_level_values(level)
            else:
                idx = table.index

            if isinstance(idx, cudf.RangeIndex):
                if index is None:
                    # Described by its bounds only; no index column is
                    # added to the converted columns.
                    descr = {
                        "kind": "range",
                        "name": table.index.name,
                        "start": table.index.start,
                        "stop": table.index.stop,
                        "step": table.index.step,
                    }
                else:
                    materialize_index = True
                    # When `index=True`, RangeIndex needs to be materialized.
                    materialized_idx = idx._as_int_index()
                    descr = _index_level_name(
                        index_name=materialized_idx.name,
                        level=level,
                        column_names=col_names,
                    )
                    index_levels.append(materialized_idx)
                    columns_to_convert.append(materialized_idx._values)
                    col_names.append(descr)
                    types.append(np_to_pa_dtype(materialized_idx.dtype))
            else:
                descr = _index_level_name(
                    index_name=idx.name,
                    level=level,
                    column_names=col_names,
                )
                columns_to_convert.append(idx._values)
                col_names.append(descr)
                # Derive the type from the index level itself, never from
                # the loop variable left over from the column loop above.
                types.append(_arrow_type_for_pandas_metadata(idx.dtype))
                index_levels.append(idx)

            index_descriptors.append(descr)

    df_meta = table.head(0)
    if materialize_index:
        df_meta.index = df_meta.index._as_int_index()
    metadata = pa.pandas_compat.construct_metadata(
        columns_to_convert=columns_to_convert,
        # It is OKAY to do `.head(0).to_pandas()` because
        # this method will extract `.columns` metadata only
        df=df_meta.to_pandas(),
        column_names=col_names,
        index_levels=index_levels,
        index_descriptors=index_descriptors,
        preserve_index=index,
        types=types,
    )

    md_dict = json.loads(metadata[b"pandas"])

    # correct metadata for list and struct and nullable numeric types
    for col_meta in md_dict["columns"]:
        if (
            col_meta["name"] in table._column_names
            and table._data[col_meta["name"]].nullable
            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
            and col_meta["pandas_type"] != "decimal"
        ):
            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
                col_meta["numpy_type"]
            ]
        if col_meta["numpy_type"] in ("list", "struct"):
            col_meta["numpy_type"] = "object"

    return json.dumps(md_dict)
184
-
185
-
186
def _index_level_name(index_name, level, column_names):
    """
    Return the name of an index level or a default name
    if `index_name` is None or is already a column name.

    Parameters
    ----------
    index_name : name of an Index object
    level : level of the Index object
    column_names : container of names already in use; an ``index_name``
        found in it is replaced by the default name

    Returns
    -------
    name
        ``index_name`` unchanged when it is usable, otherwise the string
        ``"__index_level_{level}__"``.
    """
    if index_name is None or index_name in column_names:
        return f"__index_level_{level}__"
    return index_name
204
-
205
-
206
cdef columns_from_unique_ptr(
    unique_ptr[table] c_tbl
):
    """Turn a libcudf table into a Python list of columns.

    Parameters
    ----------
    c_tbl : unique_ptr[cudf::table]
        The libcudf table whose columns will be extracted

    Returns
    -------
    list[Column]
        One ``Column`` per column of the table, in the table's order.
    """
    # Pull the vector of column pointers out of the table via ``release()``.
    cdef vector[unique_ptr[column]] owned_columns = move(
        c_tbl.get().release()
    )
    cdef size_t pos

    converted = []
    for pos in range(owned_columns.size()):
        # Each pointer is moved into pylibcudf and then wrapped as a Column.
        converted.append(
            Column.from_pylibcudf(
                plc_Column.from_libcudf(move(owned_columns[pos]))
            )
        )
    return converted
231
5
232
6
233
7
cpdef columns_from_pylibcudf_table(tbl):
@@ -281,8 +55,7 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
281
55
# the data while actually constructing the Index object here (instead
282
56
# of just returning a dict for that as well). As we clean up the
283
57
# Frame factories we may want to look for a less dissonant approach
284
- # that does not impose performance penalties. The same applies to
285
- # data_from_table_view below.
58
+ # that does not impose performance penalties.
286
59
cudf.core.index._index_from_data(
287
60
{
288
61
name: columns[i]
@@ -300,16 +73,6 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
300
73
return data, index
301
74
302
75
303
cdef data_from_unique_ptr(
    unique_ptr[table] c_tbl, column_names, index_names=None
):
    """Convert a libcudf table and hand its columns to ``_data_from_columns``.

    The table is first turned into a list of columns with
    ``columns_from_unique_ptr``; that list is forwarded together with
    ``column_names`` and ``index_names``, and the result is returned as is.
    """
    extracted = columns_from_unique_ptr(move(c_tbl))
    return _data_from_columns(extracted, column_names, index_names)
311
-
312
-
313
76
cpdef data_from_pylibcudf_table(tbl, column_names, index_names = None ):
314
77
return _data_from_columns(
315
78
columns_from_pylibcudf_table(tbl),
@@ -329,73 +92,3 @@ cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None)
329
92
column_names = column_names,
330
93
index_names = index_names
331
94
)
332
-
333
cdef columns_from_table_view(
    table_view tv,
    object owners,
):
    """
    Construct a list of columns from a ``cudf::table_view``.

    ``owners`` names the Python objects that keep the viewed memory alive.
    When it is a list, the ``i``th ``cudf::column_view`` of the table view
    is paired with ``owners[i]``; any other value means every column is
    created with ``None`` as its owner. For more about memory ownership,
    see ``Column.from_column_view``.
    """
    per_column_owners = isinstance(owners, list)

    result = []
    for i in range(tv.num_columns()):
        col_owner = owners[i] if per_column_owners else None
        result.append(Column.from_column_view(tv.column(i), col_owner))
    return result
351
-
352
cdef data_from_table_view(
    table_view tv,
    object owner,
    object column_names,
    object index_names=None
):
    """
    Construct the data mapping and index of a Frame from a
    ``cudf::table_view``.

    The leading columns of ``tv`` (one per entry of ``index_names``, when
    given) become the index; the next ones (one per entry of
    ``column_names``) become the data. ``owner`` is the Python object that
    keeps the viewed memory alive. If ``owner`` is a Frame, each new column
    is instead given the matching column of ``owner`` (from its index for
    index columns, from its data for data columns) as its owner.

    Returns a ``(data, index)`` tuple where ``data`` maps each column name
    to its column and ``index`` is ``None`` when ``index_names`` is None.
    """
    cdef size_type view_pos = 0
    cdef size_type data_pos = 0
    owner_is_frame = isinstance(owner, cudf.core.frame.Frame)

    # Index columns occupy the first positions of the view.
    index = None
    if index_names is not None:
        index_cols = []
        for _ in index_names:
            if owner_is_frame:
                col_owner = owner._index._columns[view_pos]
            else:
                col_owner = owner
            index_cols.append(
                Column.from_column_view(tv.column(view_pos), col_owner)
            )
            view_pos += 1
        index = cudf.core.index._index_from_data(
            dict(zip(index_names, index_cols))
        )

    # Data columns follow; ``data_pos`` indexes into the owner's own data
    # columns while ``view_pos`` keeps walking the view.
    data_cols = []
    for _ in column_names:
        if owner_is_frame:
            col_owner = owner._columns[data_pos]
        else:
            col_owner = owner
        data_cols.append(
            Column.from_column_view(tv.column(view_pos), col_owner)
        )
        view_pos += 1
        data_pos += 1

    return dict(zip(column_names, data_cols)), index
0 commit comments