Skip to content

Commit

Permalink
Fix naming issues with Index.to_frame and MultiIndex.to_frame APIs (
Browse files Browse the repository at this point in the history
#14105)

This PR:

- [x] Introduces `allow_duplicates` in cudf's `MultiIndex.to_frame` for parity with `pandas.MultiIndex.to_frame`; the parameter is currently non-functional because cudf doesn't support duplicate column names.
- [x] Fixed handling of duplicate index names in `MultiIndex.to_frame`
- [x] Added proper docs for `Index.to_frame` & `MultiIndex.to_frame` separately due to change in API signature.
- [x] Added tests for `Index.to_frame` & `MultiIndex.to_frame`
- [x] Introduced deprecations that will go away when pandas-2.0 support is enabled.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #14105
  • Loading branch information
galipremsagar authored Sep 13, 2023
1 parent 1668c2c commit 60009a8
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 16 deletions.
57 changes: 51 additions & 6 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
drop_nulls,
)
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import (
is_bool_dtype,
is_integer,
Expand Down Expand Up @@ -701,21 +702,65 @@ def fillna(self, value, downcast=None):

return super().fillna(value=value)

def to_frame(self, index=True, name=None):
def to_frame(self, index=True, name=no_default):
"""Create a DataFrame with a column containing this Index
Parameters
----------
index : boolean, default True
Set the index of the returned DataFrame as the original Index
name : str, default None
Name to be used for the column
name : object, defaults to index.name
The passed name should substitute for the index name (if it has
one).
Returns
-------
DataFrame
cudf DataFrame
"""
if name is not None:
DataFrame containing the original Index data.
See Also
--------
Index.to_series : Convert an Index to a Series.
Series.to_frame : Convert Series to DataFrame.
Examples
--------
>>> import cudf
>>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal')
>>> idx.to_frame()
animal
animal
Ant Ant
Bear Bear
Cow Cow
By default, the original Index is reused. To enforce a new Index:
>>> idx.to_frame(index=False)
animal
0 Ant
1 Bear
2 Cow
To override the name of the resulting column, specify `name`:
>>> idx.to_frame(index=False, name='zoo')
zoo
0 Ant
1 Bear
2 Cow
"""
if name is None:
warnings.warn(
"Explicitly passing `name=None` currently preserves "
"the Index's name or uses a default name of 0. This "
"behaviour is deprecated, and in the future `None` "
"will be used as the name of the "
"resulting DataFrame column.",
FutureWarning,
)
name = no_default
if name is not no_default:
col_name = name
elif self.name is None:
col_name = 0
Expand Down
99 changes: 89 additions & 10 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import cudf
from cudf import _lib as libcudf
from cudf._typing import DataFrameOrSeries
from cudf.api.extensions import no_default
from cudf.api.types import is_integer, is_list_like, is_object_dtype
from cudf.core import column
from cudf.core._compat import PANDAS_GE_150
Expand Down Expand Up @@ -1015,7 +1016,12 @@ def __getitem__(self, index):
elif isinstance(index, slice):
start, stop, step = index.indices(len(self))
index = column.arange(start, stop, step)
result = MultiIndex.from_frame(self.to_frame(index=False).take(index))
result = MultiIndex.from_frame(
self.to_frame(index=False, name=range(0, self.nlevels)).take(
index
),
names=self.names,
)

# we are indexing into a single row of the MultiIndex,
# return that row as a tuple:
Expand All @@ -1026,24 +1032,95 @@ def __getitem__(self, index):
result._codes = self._codes.take(index)
if self._levels is not None:
result._levels = self._levels
result.names = self.names
return result

@_cudf_nvtx_annotate
def to_frame(self, index=True, name=None):
def to_frame(self, index=True, name=no_default, allow_duplicates=False):
"""
Create a DataFrame with the levels of the MultiIndex as columns.
Column ordering is determined by the DataFrame constructor with data as
a dict.
Parameters
----------
index : bool, default True
Set the index of the returned DataFrame as the original MultiIndex.
name : list / sequence of str, optional
The passed names should substitute index level names.
allow_duplicates : bool, default False
Allow duplicate column labels to be created. Note
that this parameter is non-functional because
duplicate column labels aren't supported in cudf.
Returns
-------
DataFrame
Warns
-----
FutureWarning
If ``name=None`` is passed explicitly; it is currently treated
the same as leaving ``name`` unspecified.
Raises
------
ValueError
If ``name`` does not have one entry per index level, or if the
resulting column names contain duplicates (names that are all
``None`` are exempt from the duplicate check).
Examples
--------
>>> import cudf
>>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
>>> mi
MultiIndex([('a', 'c'),
('b', 'd')],
)
>>> df = mi.to_frame()
>>> df
0 1
a c a c
b d b d
>>> df = mi.to_frame(index=False)
>>> df
0 1
0 a c
1 b d
>>> df = mi.to_frame(name=['x', 'y'])
>>> df
x y
a c a c
b d b d
"""
# TODO: Currently this function makes a shallow copy, which is
# incorrect. We want to make a deep copy, otherwise further
# modifications of the resulting DataFrame will affect the MultiIndex.
df = cudf.DataFrame._from_data(data=self._data)
if index:
df = df.set_index(self)
if name is not None:
# An explicit ``None`` is deprecated: warn, then treat it like the
# ``no_default`` sentinel so the level names are used.
if name is None:
warnings.warn(
"Explicitly passing `name=None` currently preserves the "
"Index's name or uses a default name of 0. This behaviour "
"is deprecated, and in the future `None` will be used "
"as the name of the resulting DataFrame column.",
FutureWarning,
)
name = no_default

if name is not no_default:
if len(name) != len(self.levels):
raise ValueError(
"'name' should have the same length as "
"number of levels on index."
)
df.columns = name
column_names = name
else:
column_names = self.names
# Duplicate names are rejected unless every name is None (i.e. the
# levels are unnamed); the walrus keeps that flag for the call below.
all_none_names = None
if not (
all_none_names := all(x is None for x in column_names)
) and len(column_names) != len(set(column_names)):
raise ValueError("Duplicate column names are not allowed")
# Explicit column labels are only forwarded when the caller supplied
# `name` and it is not all-None; otherwise ``columns=None`` is passed
# (presumably keeping the keys of ``self._data`` - confirm against
# ``DataFrame._from_data``).
df = cudf.DataFrame._from_data(
data=self._data,
columns=column_names
if name is not no_default and not all_none_names
else None,
)

if index:
df = df.set_index(self)

return df

@_cudf_nvtx_annotate
Expand Down Expand Up @@ -1504,7 +1581,9 @@ def droplevel(self, level=-1):

@_cudf_nvtx_annotate
def to_pandas(self, nullable=False, **kwargs):
"""
Convert this MultiIndex to a ``pandas.MultiIndex``.

The levels are materialised as a frame, converted to pandas with the
given ``nullable`` flag, and rebuilt into a ``pandas.MultiIndex``
carrying this index's level names.
"""
result = self.to_frame(index=False).to_pandas(nullable=nullable)
# Positional integer labels are used for the intermediate frame and the
# real level names are re-applied via ``names=self.names`` below -
# presumably so duplicate level names do not trip ``to_frame``'s
# duplicate-column check.
result = self.to_frame(
index=False, name=list(range(self.nlevels))
).to_pandas(nullable=nullable)
return pd.MultiIndex.from_frame(result, names=self.names)

@classmethod
Expand Down Expand Up @@ -1623,7 +1702,7 @@ def _clean_nulls_from_index(self):
Convert all na values(if any) in MultiIndex object
to `<NA>` as a preprocessing step to `__repr__` methods.
"""
index_df = self.to_frame(index=False)
index_df = self.to_frame(index=False, name=list(range(self.nlevels)))
return MultiIndex.from_frame(
index_df._clean_nulls_from_dataframe(index_df), names=self.names
)
Expand Down
19 changes: 19 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pytest

import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype
from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200
from cudf.core.index import (
Expand Down Expand Up @@ -2777,3 +2778,21 @@ def test_index_empty_from_pandas(request, dtype):
gidx = cudf.from_pandas(pidx)

assert_eq(pidx, gidx)


@pytest.mark.parametrize(
    "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
)
@pytest.mark.parametrize("data_name", [None, 1, "abc"])
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize("name", [None, no_default, 1, "abc"])
def test_index_to_frame(data, data_name, index, name):
    """Check that ``Index.to_frame`` matches pandas for every name variant.

    Both libraries are expected to emit a warning only when ``name=None``
    is passed explicitly.
    """
    pandas_index = pd.Index(data, name=data_name)
    cudf_index = cudf.from_pandas(pandas_index)

    to_frame_kwargs = {"index": index, "name": name}
    should_warn = name is None

    with expect_warning_if(should_warn):
        pandas_frame = pandas_index.to_frame(**to_frame_kwargs)
    with expect_warning_if(should_warn):
        cudf_frame = cudf_index.to_frame(**to_frame_kwargs)

    assert_eq(pandas_frame, cudf_frame)
83 changes: 83 additions & 0 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pytest

import cudf
from cudf.api.extensions import no_default
from cudf.core._compat import PANDAS_GE_200
from cudf.core.column import as_column
from cudf.core.index import as_index
Expand Down Expand Up @@ -1926,3 +1927,85 @@ def test_multiindex_to_series_error():
midx = cudf.MultiIndex.from_tuples([("a", "b")])
with pytest.raises(NotImplementedError):
midx.to_series()


@pytest.mark.parametrize(
    "pidx",
    [
        pd.MultiIndex.from_arrays(
            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
            names=["a", "b", "c"],
        ),
        pd.MultiIndex.from_arrays(
            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
            names=["a", "a", "a"],
        ),
        pd.MultiIndex.from_arrays(
            [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]],
        ),
    ],
)
@pytest.mark.parametrize(
    "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]]
)
@pytest.mark.parametrize("allow_duplicates", [True, False])
@pytest.mark.parametrize("index", [True, False])
def test_multiindex_to_frame_allow_duplicates(
    pidx, name, allow_duplicates, index
):
    """Compare ``MultiIndex.to_frame`` against pandas, duplicates included.

    Three outcomes are covered: both libraries raise the same exception,
    only cudf raises ``ValueError`` (it cannot build frames with duplicate
    column names), or both succeed and the frames must be equal.
    """
    gidx = cudf.from_pandas(pidx)
    call_kwargs = {
        "index": index,
        "name": name,
        "allow_duplicates": allow_duplicates,
    }

    level_names = pidx.names
    index_names_duplicated = len(level_names) != len(
        set(level_names)
    ) and not all(x is None for x in level_names)
    name_is_list = isinstance(name, list)
    name_list_duplicated = name_is_list and len(name) != len(set(name))
    falls_back_to_index_names = name is None or name is no_default

    if (
        index_names_duplicated
        and not allow_duplicates
        and falls_back_to_index_names
    ):
        # pandas and cudf are expected to fail identically here.
        assert_exceptions_equal(
            pidx.to_frame,
            gidx.to_frame,
            lfunc_args_and_kwargs=([], dict(call_kwargs)),
            rfunc_args_and_kwargs=([], dict(call_kwargs)),
        )
        return

    if (
        index_names_duplicated and not name_is_list
    ) or name_list_duplicated:
        # cudf doesn't have the ability to construct dataframes
        # with duplicate column names
        with expect_warning_if(name is None), pytest.raises(ValueError):
            gidx.to_frame(**call_kwargs)
        return

    with expect_warning_if(name is None):
        expected = pidx.to_frame(**call_kwargs)
    with expect_warning_if(name is None):
        actual = gidx.to_frame(**call_kwargs)

    assert_eq(expected, actual)

0 comments on commit 60009a8

Please sign in to comment.