Skip to content

Commit

Permalink
SNOW-1445867: Add support for DataFrame.pivot and pd.pivot (#1840)
Browse files Browse the repository at this point in the history
Signed-off-by: Naren Krishna <[email protected]>
  • Loading branch information
sfc-gh-nkrishna authored Jun 28, 2024
1 parent a5c807b commit 29c4a76
Show file tree
Hide file tree
Showing 18 changed files with 325 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
- Added documentation pages for `Index` and its APIs.
- Added support for `DataFrame.assign`.
- Added support for `DataFrame.stack`.
- Added support for `DataFrame.pivot` and `pd.pivot`.

#### Bug Fixes

Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ DataFrame
DataFrame.melt
DataFrame.nlargest
DataFrame.nsmallest
DataFrame.pivot
DataFrame.pivot_table
DataFrame.sort_index
DataFrame.nlargest
Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/general_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General functions
:toctree: pandas_api/

melt
pivot
pivot_table
cut
qcut
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``pipe`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``pivot`` | N | | |
| ``pivot`` | P | | See ``pivot_table`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``pivot_table`` | P | ``observed``, ``sort`` | ``N`` if ``index``, ``columns``, or ``values`` is |
| | | | not str, list of str, or None; or MultiIndex; or |
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/general_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Data manipulations
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``merge_ordered`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``pivot`` | N | | |
| ``pivot`` | P | | See ``pivot_table`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``pivot_table`` | P | ``observed``, ``margins``, | ``N`` if ``index``, ``columns``, or ``values`` is |
| | | ``sort`` | not str; or MultiIndex; or any ``aggfunc`` is not |
Expand Down
25 changes: 22 additions & 3 deletions src/snowflake/snowpark/modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1813,12 +1813,31 @@ def unstack(self, level=-1, fill_value=None): # noqa: PR01, RT01, D200
query_compiler=self._query_compiler.unstack(level, fill_value)
)

@dataframe_not_implemented()
def pivot(self, index=None, columns=None, values=None): # noqa: PR01, RT01, D200
def pivot(
self,
*,
columns: Any,
index: Any | NoDefault = no_default,
values: Any | NoDefault = no_default,
):
"""
Return reshaped ``DataFrame`` organized by given index / column values.
Return reshaped DataFrame organized by given index / column values.
"""
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
if index is no_default:
index = None # pragma: no cover
if values is no_default:
values = None

# if values is not specified, it should be the remaining columns not in
# index or columns
if values is None:
values = list(self.columns)
if index is not None:
values = [v for v in values if v not in index]
if columns is not None:
values = [v for v in values if v not in columns]

return self.__constructor__(
query_compiler=self._query_compiler.pivot(
index=index, columns=columns, values=values
Expand Down
93 changes: 91 additions & 2 deletions src/snowflake/snowpark/modin/pandas/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,11 +648,100 @@ def pivot_table(


@snowpark_pandas_telemetry_standalone_function_decorator
@pandas_module_level_function_not_implemented()
@_inherit_docstrings(pandas.pivot, apilink="pandas.pivot")
def pivot(data, index=None, columns=None, values=None): # noqa: PR01, RT01, D200
"""
Return reshaped DataFrame organized by given index / column values.
Reshape data (produce a “pivot” table) based on column values. Uses unique values from
specified index / columns to form axes of the resulting DataFrame. This function does not
support data aggregation; multiple values will result in a MultiIndex in the columns.
Parameters
----------
data : :class:`~snowflake.snowpark.modin.pandas.DataFrame`
columns : str or object or a list of str
Column to use to make new frame’s columns.
index : str or object or a list of str, optional
Column to use to make new frame’s index. If not given, uses existing index.
values : str, object or a list of the previous, optional
Column(s) to use for populating new frame’s values. If not specified, all remaining columns
will be used and the result will have hierarchically indexed columns.
Returns
-------
:class:`~snowflake.snowpark.modin.pandas.DataFrame`
Notes
-----
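Calls ``pivot_table`` with ``columns``, ``values``, ``index`` and aggregation ``"min"``.
Unlike pandas, duplicate index/column pairs are not checked for and do not raise a
``ValueError``, because that check would require an eager query.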
See Also
--------
DataFrame.pivot_table : Generalization of pivot that can handle
duplicate values for one index/column pair.
DataFrame.unstack : Pivot based on the index values instead
of a column.
wide_to_long : Wide panel to long format. Less flexible but more
user-friendly than melt.
Examples
--------
>>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
... 'two'],
... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
... 'baz': [1, 2, 3, 4, 5, 6],
... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
>>> df
foo bar baz zoo
0 one A 1 x
1 one B 2 y
2 one C 3 z
3 two A 4 q
4 two B 5 w
5 two C 6 t
>>> pd.pivot(data=df, index='foo', columns='bar', values='baz') # doctest: +NORMALIZE_WHITESPACE
bar A B C
foo
one 1 2 3
two 4 5 6
>>> pd.pivot(data=df, index='foo', columns='bar')['baz'] # doctest: +NORMALIZE_WHITESPACE
bar A B C
foo
one 1 2 3
two 4 5 6
>>> pd.pivot(data=df, index='foo', columns='bar', values=['baz', 'zoo']) # doctest: +NORMALIZE_WHITESPACE
baz zoo
bar A B C A B C
foo
one 1 2 3 x y z
two 4 5 6 q w t
>>> df = pd.DataFrame({
... "lev1": [1, 1, 1, 2, 2, 2],
... "lev2": [1, 1, 2, 1, 1, 2],
... "lev3": [1, 2, 1, 2, 1, 2],
... "lev4": [1, 2, 3, 4, 5, 6],
... "values": [0, 1, 2, 3, 4, 5]})
>>> df
lev1 lev2 lev3 lev4 values
0 1 1 1 1 0
1 1 1 2 2 1
2 1 2 1 3 2
3 2 1 2 4 3
4 2 1 1 5 4
5 2 2 2 6 5
>>> pd.pivot(data=df, index="lev1", columns=["lev2", "lev3"], values="values") # doctest: +NORMALIZE_WHITESPACE
lev2 1 2
lev3 1 2 1 2
lev1
1 0 1 2.0 NaN
2 4 3 NaN 5.0
>>> pd.pivot(data=df, index=["lev1", "lev2"], columns=["lev3"], values="values") # doctest: +NORMALIZE_WHITESPACE
lev3 1 2
lev1 lev2
1 1 0.0 1.0
2 2.0 NaN
2 1 4.0 3.0
2 NaN 5.0
"""
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
if not isinstance(data, DataFrame):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6849,6 +6849,48 @@ def is_series_like(self) -> bool:
# TODO SNOW-864083: look into why len(self.index) == 1 is also considered as series-like
return self.get_axis_len(axis=1) == 1 or self.get_axis_len(axis=0) == 1

def pivot(
    self,
    columns: Any,
    index: Optional[Any] = None,
    values: Optional[Any] = None,
) -> "SnowflakeQueryCompiler":
    """
    Reshape the frame so that the given index / column values become its axes.

    This is a thin wrapper: the work is delegated to ``pivot_table`` using the
    ``"min"`` aggregation, with every other ``pivot_table`` option pinned to the
    fixed value listed in the body below.

    Parameters
    ----------
    columns : str or object or a list of str
        Column to use to make new frame's columns.
    index : str or object or a list of str, optional
        Column to use to make new frame's index. If not given, uses existing index.
    values : str, object or a list of the previous, optional
        Column(s) to use for populating new frame's values. If not specified, all
        remaining columns will be used and the result will have hierarchically
        indexed columns.

    Returns
    -------
    SnowflakeQueryCompiler

    Notes
    -----
    This differs from pandas: duplicate index/column pairs are not detected and
    no ``ValueError`` is raised for them, because that check would require an
    eager query.
    """
    # ``pivot_table`` is the generalized form of ``pivot``; the caller's three
    # arguments are forwarded unchanged and the remaining options are fixed.
    pivot_table_kwargs = {
        "columns": columns,
        "index": index,
        "values": values,
        "aggfunc": "min",
        "fill_value": None,
        "margins": False,
        "dropna": True,
        "margins_name": "All",
        "observed": False,
        "sort": True,
    }
    return self.pivot_table(**pivot_table_kwargs)

def pivot_table(
self,
index: Any,
Expand Down
92 changes: 91 additions & 1 deletion src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2185,7 +2185,97 @@ def unstack():

def pivot():
"""
Return reshaped ``DataFrame`` organized by given index / column values.
Return reshaped DataFrame organized by given index / column values.
Reshape data (produce a "pivot" table) based on column values. Uses unique values from
specified index / columns to form axes of the resulting DataFrame. This function does not
support data aggregation; multiple values will result in a MultiIndex in the columns.
Parameters
----------
columns : str or object or a list of str
Column to use to make new frame’s columns.
index : str or object or a list of str, optional
Column to use to make new frame’s index. If not given, uses existing index.
values : str, object or a list of the previous, optional
Column(s) to use for populating new frame’s values. If not specified, all remaining columns
will be used and the result will have hierarchically indexed columns.
Returns
-------
:class:`~snowflake.snowpark.modin.pandas.DataFrame`
Notes
-----
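Calls ``pivot_table`` with ``columns``, ``values``, ``index`` and aggregation ``"min"``.
Unlike pandas, duplicate index/column pairs are not checked for and do not raise a
``ValueError``, because that check would require an eager query.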
See Also
--------
DataFrame.pivot_table : Generalization of pivot that can handle
duplicate values for one index/column pair.
DataFrame.unstack : Pivot based on the index values instead
of a column.
wide_to_long : Wide panel to long format. Less flexible but more
user-friendly than melt.
Examples
--------
>>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
... 'two'],
... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
... 'baz': [1, 2, 3, 4, 5, 6],
... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
>>> df
foo bar baz zoo
0 one A 1 x
1 one B 2 y
2 one C 3 z
3 two A 4 q
4 two B 5 w
5 two C 6 t
>>> df.pivot(index='foo', columns='bar', values='baz') # doctest: +NORMALIZE_WHITESPACE
bar A B C
foo
one 1 2 3
two 4 5 6
>>> df.pivot(index='foo', columns='bar')['baz'] # doctest: +NORMALIZE_WHITESPACE
bar A B C
foo
one 1 2 3
two 4 5 6
>>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) # doctest: +NORMALIZE_WHITESPACE
baz zoo
bar A B C A B C
foo
one 1 2 3 x y z
two 4 5 6 q w t
>>> df = pd.DataFrame({
... "lev1": [1, 1, 1, 2, 2, 2],
... "lev2": [1, 1, 2, 1, 1, 2],
... "lev3": [1, 2, 1, 2, 1, 2],
... "lev4": [1, 2, 3, 4, 5, 6],
... "values": [0, 1, 2, 3, 4, 5]})
>>> df
lev1 lev2 lev3 lev4 values
0 1 1 1 1 0
1 1 1 2 2 1
2 1 2 1 3 2
3 2 1 2 4 3
4 2 1 1 5 4
5 2 2 2 6 5
>>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") # doctest: +NORMALIZE_WHITESPACE
lev2 1 2
lev3 1 2 1 2
lev1
1 0 1 2.0 NaN
2 4 3 NaN 5.0
>>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") # doctest: +NORMALIZE_WHITESPACE
lev3 1 2
lev1 lev2
1 1 0.0 1.0
2 2.0 NaN
2 1 4.0 3.0
2 NaN 5.0
"""

def pivot_table():
Expand Down
10 changes: 10 additions & 0 deletions tests/integ/modin/pivot/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,13 @@ def df_data_with_nulls_2():
"E": [1, 2],
"F": [None, 2],
}


@pytest.fixture(scope="module")
def df_pivot_data():
    """
    Column data for a small six-row, four-column frame.

    The values match the ``foo`` / ``bar`` / ``baz`` / ``zoo`` frame shown in the
    ``pivot`` docstring examples.
    """
    foo_values = ["one", "one", "one", "two", "two", "two"]
    bar_values = ["A", "B", "C", "A", "B", "C"]
    baz_values = [1, 2, 3, 4, 5, 6]
    zoo_values = ["x", "y", "z", "q", "w", "t"]
    return {
        "foo": foo_values,
        "bar": bar_values,
        "baz": baz_values,
        "zoo": zoo_values,
    }
Loading

0 comments on commit 29c4a76

Please sign in to comment.