Skip to content

Commit

Permalink
SNOW-1480718: Support Series.str.translate (#1776)
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-joshi authored Jun 28, 2024
1 parent 9372edf commit a5c807b
Show file tree
Hide file tree
Showing 7 changed files with 265 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
- Added distributed tracing using open telemetry APIs for table stored procedure function in `DataFrame`:
- _execute_and_get_query_id

### Snowpark pandas API Updates

#### New Features
- Added partial support for `Series.str.translate` where the values in the `table` are single-codepoint strings.

## 1.19.0 (2024-06-25)

### Snowpark Python API Updates
Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,4 +285,5 @@ Series
Series.str.split
Series.str.startswith
Series.str.strip
Series.str.translate
Series.str.upper
3 changes: 2 additions & 1 deletion docs/source/modin/supported/series_str_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ the method in the left column.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``title`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``translate`` | N | |
| ``translate`` | P | ``N`` if any value in `table` has multiple |
| | | characters. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``upper`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@
timestamp_ntz_from_parts,
to_date,
to_variant,
translate,
trim,
uniform,
upper,
Expand Down Expand Up @@ -13979,8 +13980,66 @@ def str_rstrip(self, to_strip: Union[str, None] = None) -> "SnowflakeQueryCompil
def str_swapcase(self) -> None:
    # Series.str.swapcase is not yet supported in Snowpark pandas; the shared
    # error helper reports the unimplemented method (presumably by raising
    # NotImplementedError — confirm against ErrorMessage's definition).
    ErrorMessage.method_not_implemented_error("swapcase", "Series.str")

def str_translate(self, table: dict) -> None:
    # Previous stub (the deleted side of this diff): translate was unsupported
    # and routed to the shared unimplemented-method error helper.
    ErrorMessage.method_not_implemented_error("translate", "Series.str")
def str_translate(self, table: dict) -> "SnowflakeQueryCompiler":
    """
    Map all characters in the string through the given mapping table.

    Equivalent to standard :meth:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
        None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.

    Returns
    -------
    SnowflakeQueryCompiler representing results of the string operation.

    Raises
    ------
    ValueError
        If any key in `table` is not a single character (after converting
        integer ordinals with `chr`), mimicking `str.maketrans`.
    NotImplementedError
        If any value in `table` is a string longer than one codepoint, which
        SQL TRANSLATE cannot express.
    """
    # Snowflake SQL TRANSLATE:
    # TRANSLATE(<subject>, <sourceAlphabet>, <targetAlphabet>)
    # Characters in the <sourceAlphabet> string are mapped to the corresponding entry in <targetAlphabet>.
    # If <sourceAlphabet> is longer than <targetAlphabet>, then the trailing characters of <sourceAlphabet>
    # are removed from the input string.
    #
    # Because TRANSLATE only supports 1-to-1 character mappings, any entries with multi-character
    # values must be handled by REPLACE instead. Keys must always be exactly
    # one character; anything else is rejected below.
    single_char_pairs = {}  # 1-char key -> 1-char replacement value
    none_keys = set()  # keys whose characters are deleted from the input
    for key, value in table.items():
        # Treat integers as unicode codepoints
        if isinstance(key, int):
            key = chr(key)
        if isinstance(value, int):
            value = chr(value)
        if len(key) != 1:
            # Mimic error from str.maketrans
            raise ValueError(
                f"Invalid mapping key '{key}'. String keys in translate table must be of length 1."
            )
        if value is not None and len(value) > 1:
            raise NotImplementedError(
                f"Invalid mapping value '{value}' for key '{key}'. Snowpark pandas currently only "
                "supports unicode ordinals or 1-codepoint strings as values in str.translate mappings. "
                "Consider using Series.str.replace to replace multiple characters."
            )
        if value is None or len(value) == 0:
            none_keys.add(key)
        else:
            single_char_pairs[key] = value
    # Deleted characters are appended to the tail of the source alphabet:
    # TRANSLATE removes source characters that have no target counterpart.
    source_alphabet = "".join(single_char_pairs.keys()) + "".join(none_keys)
    target_alphabet = "".join(single_char_pairs.values())
    return SnowflakeQueryCompiler(
        self._modin_frame.apply_snowpark_function_to_data_columns(
            lambda col_name: translate(
                col(col_name),
                pandas_lit(source_alphabet),
                pandas_lit(target_alphabet),
            )
        )
    )

def str_wrap(self, width: int, **kwargs: Any) -> None:
    # Series.str.wrap is not yet supported in Snowpark pandas; the shared
    # error helper reports the unimplemented method (presumably by raising
    # NotImplementedError — confirm against ErrorMessage's definition).
    ErrorMessage.method_not_implemented_error("wrap", "Series.str")
Expand Down
69 changes: 68 additions & 1 deletion src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -960,7 +960,74 @@ def normalize():
pass

def translate():
    """
    Map all characters in the string through the given mapping table.

    Equivalent to standard :meth:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
        None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.

    Returns
    -------
    Series

    Examples
    --------
    >>> ser = pd.Series(["El niño", "Françoise"])
    >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'})
    >>> ser.str.translate(mytable)  # doctest: +NORMALIZE_WHITESPACE
    0      El nino
    1    Francoise
    dtype: object

    Notes
    -----
    Snowpark pandas internally uses the Snowflake SQL `TRANSLATE` function to implement this
    operation. Since this function uses strings instead of unicode codepoints, it will accept
    mappings containing string keys that would be invalid in pandas.

    The following example fails silently in vanilla pandas without `str.maketrans`:

    >>> import pandas
    >>> pandas.Series("aaa").str.translate({"a": "A"})
    0    aaa
    dtype: object

    >>> pandas.Series("aaa").str.translate(str.maketrans({"a": "A"}))
    0    AAA
    dtype: object

    The same code works in Snowpark pandas without `str.maketrans`:

    >>> pd.Series("aaa").str.translate({"a": "A"})
    0    AAA
    dtype: object

    >>> pd.Series("aaa").str.translate(str.maketrans({"a": "A"}))
    0    AAA
    dtype: object

    Furthermore, due to restrictions in the underlying SQL, Snowpark pandas currently requires
    all string values to be one unicode codepoint in length. To create replacements of multiple
    characters, chain calls to `Series.str.replace` as needed.

    Vanilla pandas code:

    >>> import pandas
    >>> pandas.Series("ab").str.translate(str.maketrans({"a": "A", "b": "BBB"}))
    0    ABBB
    dtype: object

    Snowpark pandas equivalent:

    >>> pd.Series("ab").str.translate({"a": "A"}).str.replace("b", "BBB")
    0    ABBB
    dtype: object
    """

def isalnum():
    # Docstring carrier for Series.str.isalnum; no override docstring has been
    # written yet, so the stub body is a bare `pass`.
    pass
Expand Down
128 changes: 128 additions & 0 deletions tests/integ/modin/strings/test_translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#

import modin.pandas as pd
import pandas as native_pd
import pytest

import snowflake.snowpark.modin.plugin # noqa: F401
from tests.integ.modin.sql_counter import sql_count_checker
from tests.integ.modin.utils import (
assert_snowpark_pandas_equal_to_pandas,
create_test_series,
eval_snowpark_pandas_result,
)


@pytest.mark.parametrize(
    "data, table",
    [
        (
            # Simple 1-element mapping
            ["aaaaa", "bbbaaa", "cafdsaf;lh"],
            str.maketrans("a", "b"),
        ),
        (
            # Mapping with mixed str, unicode code points, and Nones
            ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"],
            str.maketrans(
                {ord("a"): "A", ord("f"): None, "y": "z", "k": None, ord("j"): ""}
            ),
        ),
        (
            # Mapping with special characters
            [
                "Peña",
                "Ordoñez",
                "Raúl",
                "Ibañez",
                "François",
                "øen",
                "2πr = τ",
                "München",
            ],
            str.maketrans(
                {
                    "ñ": "n",
                    "ú": "u",
                    "ç": "c",
                    "ø": "o",
                    "τ": "t",
                    "π": "p",
                    "ü": "u",
                }
            ),
        ),
        (
            # Mapping with compound emojis. Each item in the series renders as a single emoji,
            # but is actually 4 characters. Calling `len` on each element correctly returns 4.
            # https://unicode.org/emoji/charts/emoji-zwj-sequences.html
            # Inputs:
            # - "head shaking horizontally" = 1F642 + 200D + 2194 + FE0F
            # - "heart on fire" = 2764 + FE0F + 200D + 1F525
            # - "judge" = 1F9D1 + 200D + 2696 + FE0F
            # Outputs:
            # - "head shaking vertically" = 1F642 + 200D + 2195 + FE0F
            # - "mending heart" = 2764 + FE0F + 200D + 1FA79
            # - "health worker" = 1F9D1 + 200D + 2695 + FE0F
            ["🙂‍↔️", "❤️‍🔥", "🧑‍⚖️"],
            {
                0x2194: 0x2195,
                0x1F525: 0x1FA79,
                0x2696: 0x2695,
            },
        ),
    ],
)
@sql_count_checker(query_count=1)
def test_translate(data, table):
    # Compare Snowpark pandas and native pandas results on the same inputs.
    eval_snowpark_pandas_result(
        *create_test_series(data), lambda ser: ser.str.translate(table)
    )


@sql_count_checker(query_count=1)
def test_translate_without_maketrans():
    # pandas only understands tables whose keys are unicode ordinals, so raw
    # string keys must normally be converted through `ord` or `str.maketrans`.
    # Snowflake SQL maps strings directly, so Snowpark pandas accepts string
    # keys in addition to ordinals.
    data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"]
    table = {ord("a"): "A", ord("f"): None, "y": "z", "k": None}
    expected_native = native_pd.Series(data).str.translate(str.maketrans(table))
    assert_snowpark_pandas_equal_to_pandas(
        pd.Series(data).str.translate(table),
        expected_native,
    )
    # Without str.maketrans, native pandas silently ignores the "y" and "k"
    # entries because their keys are strings rather than unicode ordinals.
    raw_native_result = native_pd.Series(data).str.translate(table)
    assert not raw_native_result.equals(expected_native)


@pytest.mark.parametrize(
    "table, error",
    [
        ({"😶‍🌫️": "a"}, ValueError),  # This emoji key is secretly 4 code points
        ({"aa": "a"}, ValueError),  # Key is 2 chars
        # Mapping 1 char to multiple is valid in vanilla pandas, but we don't support this
        (
            {ord("a"): "😶‍🌫️"},
            NotImplementedError,
        ),  # This emoji value is secretly 4 code points
        ({ord("a"): "aa"}, NotImplementedError),  # Value is 2 chars
    ],
)
@sql_count_checker(query_count=0)
def test_translate_invalid_mappings(table, error):
    data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"]
    # native pandas silently treats all of these cases as no-ops. However, since Snowflake SQL uses
    # strings as mappings instead of a dict construct, passing these arguments to the equivalent
    # SQL argument would either cause an inscrutable error or unexpected changes to the output series.
    snow_ser, native_ser = create_test_series(data)
    # Demonstrate that native pandas accepts the table without raising.
    native_ser.str.translate(table)
    with pytest.raises(error):
        snow_ser.str.translate(table)
1 change: 0 additions & 1 deletion tests/unit/modin/test_series_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def test_str_cat_no_others(mock_str_register, mock_series):
(lambda s: s.str.rindex("abc", start=1), "rindex"),
(lambda s: s.str.swapcase(), "swapcase"),
(lambda s: s.str.normalize("NFC"), "normalize"),
(lambda s: s.str.translate(str.maketrans("a", "b")), "translate"),
(lambda s: s.str.isalnum(), "isalnum"),
(lambda s: s.str.isalpha(), "isalpha"),
(lambda s: s.str.isnumeric(), "isnumeric"),
Expand Down

0 comments on commit a5c807b

Please sign in to comment.