SNOW-1445867: Add support for DataFrame.pivot and pd.pivot (#1840)

Signed-off-by: Naren Krishna <[email protected]>
snowflakedb · Jun 28, 2024 · 29c4a76 · 29c4a76
1 parent a5c807b
commit 29c4a76
Show file tree

Hide file tree

Showing 18 changed files with 325 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -87,6 +87,7 @@
 - Added documentation pages for `Index` and its APIs.
 - Added support for `DataFrame.assign`.
 - Added support for `DataFrame.stack`.
+- Added support for `DataFrame.pivot` and `pd.pivot`.
 
 #### Bug Fixes
 

diff --git a/docs/source/modin/dataframe.rst b/docs/source/modin/dataframe.rst
@@ -186,6 +186,7 @@ DataFrame
     DataFrame.melt
     DataFrame.nlargest
     DataFrame.nsmallest
+    DataFrame.pivot
     DataFrame.pivot_table
     DataFrame.sort_index
     DataFrame.nlargest

diff --git a/docs/source/modin/general_functions.rst b/docs/source/modin/general_functions.rst
@@ -11,6 +11,7 @@ General functions
     :toctree: pandas_api/
 
     melt
+    pivot
     pivot_table
     cut
     qcut

diff --git a/docs/source/modin/supported/dataframe_supported.rst b/docs/source/modin/supported/dataframe_supported.rst
@@ -295,7 +295,7 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``pipe``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``pivot``                   | N                               |                                  |                                                    |
+| ``pivot``                   | P                               |                                  | See ``pivot_table``                                |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``pivot_table``             | P                               | ``observed``, ``sort``           | ``N`` if ``index``, ``columns``, or ``values`` is  |
 |                             |                                 |                                  | not str, list of str, or None; or MultiIndex; or   |

diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
@@ -39,7 +39,7 @@ Data manipulations
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``merge_ordered``           | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``pivot``                   | N                               |                                  |                                                    |
+| ``pivot``                   | P                               |                                  | See ``pivot_table``                                |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``pivot_table``             | P                               | ``observed``, ``margins``,       | ``N`` if ``index``, ``columns``, or ``values`` is  |
 |                             |                                 | ``sort``                         | not str; or MultiIndex; or any ``argfunc`` is not  |

diff --git a/src/snowflake/snowpark/modin/pandas/dataframe.py b/src/snowflake/snowpark/modin/pandas/dataframe.py
@@ -1813,12 +1813,31 @@ def unstack(self, level=-1, fill_value=None):  # noqa: PR01, RT01, D200
                 query_compiler=self._query_compiler.unstack(level, fill_value)
             )
 
-    @dataframe_not_implemented()
-    def pivot(self, index=None, columns=None, values=None):  # noqa: PR01, RT01, D200
+    def pivot(
+        self,
+        *,
+        columns: Any,
+        index: Any | NoDefault = no_default,
+        values: Any | NoDefault = no_default,
+    ):
         """
-        Return reshaped ``DataFrame`` organized by given index / column values.
+        Return reshaped DataFrame organized by given index / column values.
         """
         # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
+        if index is no_default:
+            index = None  # pragma: no cover
+        if values is no_default:
+            values = None
+
+        # if values is not specified, it should be the remaining columns not in
+        # index or columns
+        if values is None:
+            values = list(self.columns)
+            if index is not None:
+                values = [v for v in values if v not in index]
+            if columns is not None:
+                values = [v for v in values if v not in columns]
+
         return self.__constructor__(
             query_compiler=self._query_compiler.pivot(
                 index=index, columns=columns, values=values

diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py
@@ -648,11 +648,100 @@ def pivot_table(
 
 
 @snowpark_pandas_telemetry_standalone_function_decorator
-@pandas_module_level_function_not_implemented()
-@_inherit_docstrings(pandas.pivot, apilink="pandas.pivot")
 def pivot(data, index=None, columns=None, values=None):  # noqa: PR01, RT01, D200
     """
     Return reshaped DataFrame organized by given index / column values.
+
+    Reshape data (produce a "pivot" table) based on column values. Uses unique values from
+    specified index / columns to form axes of the resulting DataFrame. This function does not
+    support data aggregation; multiple values will result in a MultiIndex in the columns.
+
+    Parameters
+    ----------
+    data : :class:`~snowflake.snowpark.modin.pandas.DataFrame`
+    columns : str or object or a list of str
+        Column to use to make new frame’s columns.
+    index : str or object or a list of str, optional
+        Column to use to make new frame’s index. If not given, uses existing index.
+    values : str, object or a list of the previous, optional
+        Column(s) to use for populating new frame’s values. If not specified, all remaining columns
+        will be used and the result will have hierarchically indexed columns.
+
+    Returns
+    -------
+    :class:`~snowflake.snowpark.modin.pandas.DataFrame`
+
+    Notes
+    -----
+    Calls pivot_table with columns, values, index and aggregation "min".
+
+    See Also
+    --------
+    DataFrame.pivot_table : Generalization of pivot that can handle
+        duplicate values for one index/column pair.
+    DataFrame.unstack : Pivot based on the index values instead
+        of a column.
+    wide_to_long : Wide panel to long format. Less flexible but more
+        user-friendly than melt.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
+    ...                   'two'],
+    ...           'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+    ...           'baz': [1, 2, 3, 4, 5, 6],
+    ...           'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+    >>> df
+       foo bar  baz zoo
+    0  one   A    1   x
+    1  one   B    2   y
+    2  one   C    3   z
+    3  two   A    4   q
+    4  two   B    5   w
+    5  two   C    6   t
+    >>> pd.pivot(data=df, index='foo', columns='bar', values='baz')  # doctest: +NORMALIZE_WHITESPACE
+    bar  A  B  C
+    foo
+    one  1  2  3
+    two  4  5  6
+    >>> pd.pivot(data=df, index='foo', columns='bar')['baz']  # doctest: +NORMALIZE_WHITESPACE
+    bar  A  B  C
+    foo
+    one  1  2  3
+    two  4  5  6
+    >>> pd.pivot(data=df, index='foo', columns='bar', values=['baz', 'zoo'])  # doctest: +NORMALIZE_WHITESPACE
+        baz       zoo
+    bar   A  B  C   A  B  C
+    foo
+    one   1  2  3   x  y  z
+    two   4  5  6   q  w  t
+    >>> df = pd.DataFrame({
+    ...     "lev1": [1, 1, 1, 2, 2, 2],
+    ...     "lev2": [1, 1, 2, 1, 1, 2],
+    ...     "lev3": [1, 2, 1, 2, 1, 2],
+    ...     "lev4": [1, 2, 3, 4, 5, 6],
+    ...     "values": [0, 1, 2, 3, 4, 5]})
+    >>> df
+       lev1  lev2  lev3  lev4  values
+    0     1     1     1     1       0
+    1     1     1     2     2       1
+    2     1     2     1     3       2
+    3     2     1     2     4       3
+    4     2     1     1     5       4
+    5     2     2     2     6       5
+    >>> pd.pivot(data=df, index="lev1", columns=["lev2", "lev3"], values="values")  # doctest: +NORMALIZE_WHITESPACE
+    lev2  1       2
+    lev3  1  2    1    2
+    lev1
+    1     0  1  2.0  NaN
+    2     4  3  NaN  5.0
+    >>> pd.pivot(data=df, index=["lev1", "lev2"], columns=["lev3"], values="values")  # doctest: +NORMALIZE_WHITESPACE
+    lev3         1    2
+    lev1 lev2
+    1    1     0.0  1.0
+         2     2.0  NaN
+    2    1     4.0  3.0
+         2     NaN  5.0
     """
     # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
     if not isinstance(data, DataFrame):

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -6849,6 +6849,48 @@ def is_series_like(self) -> bool:
         # TODO SNOW-864083: look into why len(self.index) == 1 is also considered as series-like
         return self.get_axis_len(axis=1) == 1 or self.get_axis_len(axis=0) == 1
 
+    def pivot(
+        self,
+        columns: Any,
+        index: Optional[Any] = None,
+        values: Optional[Any] = None,
+    ) -> "SnowflakeQueryCompiler":
+        """
+        Return reshaped DataFrame organized by given index / column values.
+
+        Reshape data (produce a "pivot" table) based on column values. Uses unique values from
+        specified index / columns to form axes of the resulting DataFrame. This function does not
+        support data aggregation; multiple values will result in a MultiIndex in the columns.
+
+        Parameters
+        ----------
+        columns : str or object or a list of str
+            Column to use to make new frame’s columns.
+        index : str or object or a list of str, optional
+            Column to use to make new frame’s index. If not given, uses existing index.
+        values : str, object or a list of the previous, optional
+            Column(s) to use for populating new frame’s values. If not specified, all remaining columns
+            will be used and the result will have hierarchically indexed columns.
+
+        Returns
+        -------
+        SnowflakeQueryCompiler
+        """
+        # Delegate to pivot_table, the generalized form of pivot, using `min` as the aggregation.
+        # Unlike pandas, duplicate index/column pairs are not detected (pandas raises a ValueError), because that check would require an eager query.
+        return self.pivot_table(
+            columns=columns,
+            index=index,
+            values=values,
+            aggfunc="min",
+            fill_value=None,
+            margins=False,
+            dropna=True,
+            margins_name="All",
+            observed=False,
+            sort=True,
+        )
+
     def pivot_table(
         self,
         index: Any,

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -2185,7 +2185,97 @@ def unstack():
 
     def pivot():
         """
-        Return reshaped ``DataFrame`` organized by given index / column values.
+        Return reshaped DataFrame organized by given index / column values.
+
+        Reshape data (produce a "pivot" table) based on column values. Uses unique values from
+        specified index / columns to form axes of the resulting DataFrame. This function does not
+        support data aggregation; multiple values will result in a MultiIndex in the columns.
+
+        Parameters
+        ----------
+        columns : str or object or a list of str
+            Column to use to make new frame’s columns.
+        index : str or object or a list of str, optional
+            Column to use to make new frame’s index. If not given, uses existing index.
+        values : str, object or a list of the previous, optional
+            Column(s) to use for populating new frame’s values. If not specified, all remaining columns
+            will be used and the result will have hierarchically indexed columns.
+
+        Returns
+        -------
+        :class:`~snowflake.snowpark.modin.pandas.DataFrame`
+
+        Notes
+        -----
+        Calls pivot_table with columns, values, index and aggregation "min".
+
+        See Also
+        --------
+        DataFrame.pivot_table : Generalization of pivot that can handle
+            duplicate values for one index/column pair.
+        DataFrame.unstack : Pivot based on the index values instead
+            of a column.
+        wide_to_long : Wide panel to long format. Less flexible but more
+            user-friendly than melt.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
+        ...                   'two'],
+        ...           'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        ...           'baz': [1, 2, 3, 4, 5, 6],
+        ...           'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+        >>> df
+           foo bar  baz zoo
+        0  one   A    1   x
+        1  one   B    2   y
+        2  one   C    3   z
+        3  two   A    4   q
+        4  two   B    5   w
+        5  two   C    6   t
+        >>> df.pivot(index='foo', columns='bar', values='baz')  # doctest: +NORMALIZE_WHITESPACE
+        bar  A  B  C
+        foo
+        one  1  2  3
+        two  4  5  6
+        >>> df.pivot(index='foo', columns='bar')['baz']  # doctest: +NORMALIZE_WHITESPACE
+        bar  A  B  C
+        foo
+        one  1  2  3
+        two  4  5  6
+        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])  # doctest: +NORMALIZE_WHITESPACE
+            baz       zoo
+        bar   A  B  C   A  B  C
+        foo
+        one   1  2  3   x  y  z
+        two   4  5  6   q  w  t
+        >>> df = pd.DataFrame({
+        ...     "lev1": [1, 1, 1, 2, 2, 2],
+        ...     "lev2": [1, 1, 2, 1, 1, 2],
+        ...     "lev3": [1, 2, 1, 2, 1, 2],
+        ...     "lev4": [1, 2, 3, 4, 5, 6],
+        ...     "values": [0, 1, 2, 3, 4, 5]})
+        >>> df
+           lev1  lev2  lev3  lev4  values
+        0     1     1     1     1       0
+        1     1     1     2     2       1
+        2     1     2     1     3       2
+        3     2     1     2     4       3
+        4     2     1     1     5       4
+        5     2     2     2     6       5
+        >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")  # doctest: +NORMALIZE_WHITESPACE
+        lev2  1       2
+        lev3  1  2    1    2
+        lev1
+        1     0  1  2.0  NaN
+        2     4  3  NaN  5.0
+        >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")  # doctest: +NORMALIZE_WHITESPACE
+        lev3         1    2
+        lev1 lev2
+        1    1     0.0  1.0
+             2     2.0  NaN
+        2    1     4.0  3.0
+             2     NaN  5.0
         """
 
     def pivot_table():

diff --git a/tests/integ/modin/pivot/conftest.py b/tests/integ/modin/pivot/conftest.py
@@ -373,3 +373,13 @@ def df_data_with_nulls_2():
         "E": [1, 2],
         "F": [None, 2],
     }
+
+
+@pytest.fixture(scope="module")
+def df_pivot_data():
+    return {
+        "foo": ["one", "one", "one", "two", "two", "two"],
+        "bar": ["A", "B", "C", "A", "B", "C"],
+        "baz": [1, 2, 3, 4, 5, 6],
+        "zoo": ["x", "y", "z", "q", "w", "t"],
+    }
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ General functions @@
         :toctree: pandas_api/
         melt
+        pivot
         pivot_table
         cut
         qcut
@@ Expand Down @@