Skip to content

Commit

Permalink
SNOW-1480718: Support Series.str.translate (#1776)
Browse files Browse the repository at this point in the history
  • Loading branch information
sfc-gh-joshi authored Jun 28, 2024
1 parent 9372edf commit a5c807b
Show file tree
Hide file tree
Showing 7 changed files with 265 additions and 5 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
- Added distributed tracing using open telemetry APIs for table stored procedure function in `DataFrame`:
- _execute_and_get_query_id

### Snowpark pandas API Updates

#### New Features
- Added partial support for `Series.str.translate` where the values in the `table` are single-codepoint strings.

## 1.19.0 (2024-06-25)

### Snowpark Python API Updates
Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,4 +285,5 @@ Series
Series.str.split
Series.str.startswith
Series.str.strip
Series.str.translate
Series.str.upper
3 changes: 2 additions & 1 deletion docs/source/modin/supported/series_str_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ the method in the left column.
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``title`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``translate`` | N | |
| ``translate`` | P | ``N`` if any value in `table` has multiple |
| | | characters. |
+-----------------------------+---------------------------------+----------------------------------------------------+
| ``upper`` | Y | |
+-----------------------------+---------------------------------+----------------------------------------------------+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@
timestamp_ntz_from_parts,
to_date,
to_variant,
translate,
trim,
uniform,
upper,
Expand Down Expand Up @@ -13979,8 +13980,66 @@ def str_rstrip(self, to_strip: Union[str, None] = None) -> "SnowflakeQueryCompil
def str_swapcase(self) -> None:
    # Series.str.swapcase is not yet supported in Snowpark pandas; the shared
    # error helper reports the unimplemented method (presumably by raising
    # NotImplementedError — confirm against ErrorMessage's definition).
    ErrorMessage.method_not_implemented_error("swapcase", "Series.str")

def str_translate(self, table: dict) -> None:
    # Previous stub (the deleted side of this diff): translate was unsupported
    # and routed to the shared unimplemented-method error helper.
    ErrorMessage.method_not_implemented_error("translate", "Series.str")
def str_translate(self, table: dict) -> "SnowflakeQueryCompiler":
    """
    Map all characters in the string through the given mapping table.

    Equivalent to standard :meth:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
        None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.

    Returns
    -------
    SnowflakeQueryCompiler representing results of the string operation.

    Raises
    ------
    ValueError
        If any key in `table` is not a single character (after converting
        integer ordinals with `chr`), mimicking `str.maketrans`.
    NotImplementedError
        If any value in `table` is a string longer than one codepoint, which
        SQL TRANSLATE cannot express.
    """
    # Snowflake SQL TRANSLATE:
    # TRANSLATE(<subject>, <sourceAlphabet>, <targetAlphabet>)
    # Characters in the <sourceAlphabet> string are mapped to the corresponding entry in <targetAlphabet>.
    # If <sourceAlphabet> is longer than <targetAlphabet>, then the trailing characters of <sourceAlphabet>
    # are removed from the input string.
    #
    # Because TRANSLATE only supports 1-to-1 character mappings, any entries with multi-character
    # values must be handled by REPLACE instead. Keys must always be exactly
    # one character; anything else is rejected below.
    single_char_pairs = {}  # 1-char key -> 1-char replacement value
    none_keys = set()  # keys whose characters are deleted from the input
    for key, value in table.items():
        # Treat integers as unicode codepoints
        if isinstance(key, int):
            key = chr(key)
        if isinstance(value, int):
            value = chr(value)
        if len(key) != 1:
            # Mimic error from str.maketrans
            raise ValueError(
                f"Invalid mapping key '{key}'. String keys in translate table must be of length 1."
            )
        if value is not None and len(value) > 1:
            raise NotImplementedError(
                f"Invalid mapping value '{value}' for key '{key}'. Snowpark pandas currently only "
                "supports unicode ordinals or 1-codepoint strings as values in str.translate mappings. "
                "Consider using Series.str.replace to replace multiple characters."
            )
        if value is None or len(value) == 0:
            none_keys.add(key)
        else:
            single_char_pairs[key] = value
    # Deleted characters are appended to the tail of the source alphabet:
    # TRANSLATE removes source characters that have no target counterpart.
    source_alphabet = "".join(single_char_pairs.keys()) + "".join(none_keys)
    target_alphabet = "".join(single_char_pairs.values())
    return SnowflakeQueryCompiler(
        self._modin_frame.apply_snowpark_function_to_data_columns(
            lambda col_name: translate(
                col(col_name),
                pandas_lit(source_alphabet),
                pandas_lit(target_alphabet),
            )
        )
    )

def str_wrap(self, width: int, **kwargs: Any) -> None:
    # Series.str.wrap is not yet supported in Snowpark pandas; the shared
    # error helper reports the unimplemented method (presumably by raising
    # NotImplementedError — confirm against ErrorMessage's definition).
    ErrorMessage.method_not_implemented_error("wrap", "Series.str")
Expand Down
69 changes: 68 additions & 1 deletion src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -960,7 +960,74 @@ def normalize():
pass

def translate():
    """
    Map all characters in the string through the given mapping table.

    Equivalent to standard :meth:`str.translate`.

    Parameters
    ----------
    table : dict
        Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
        None. Unmapped characters are left untouched.
        Characters mapped to None are deleted. :meth:`str.maketrans` is a
        helper function for making translation tables.

    Returns
    -------
    Series

    Examples
    --------
    >>> ser = pd.Series(["El niño", "Françoise"])
    >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'})
    >>> ser.str.translate(mytable)  # doctest: +NORMALIZE_WHITESPACE
    0      El nino
    1    Francoise
    dtype: object

    Notes
    -----
    Snowpark pandas internally uses the Snowflake SQL `TRANSLATE` function to implement this
    operation. Since this function uses strings instead of unicode codepoints, it will accept
    mappings containing string keys that would be invalid in pandas.

    The following example fails silently in vanilla pandas without `str.maketrans`:

    >>> import pandas
    >>> pandas.Series("aaa").str.translate({"a": "A"})
    0    aaa
    dtype: object

    >>> pandas.Series("aaa").str.translate(str.maketrans({"a": "A"}))
    0    AAA
    dtype: object

    The same code works in Snowpark pandas without `str.maketrans`:

    >>> pd.Series("aaa").str.translate({"a": "A"})
    0    AAA
    dtype: object

    >>> pd.Series("aaa").str.translate(str.maketrans({"a": "A"}))
    0    AAA
    dtype: object

    Furthermore, due to restrictions in the underlying SQL, Snowpark pandas currently requires
    all string values to be one unicode codepoint in length. To create replacements of multiple
    characters, chain calls to `Series.str.replace` as needed.

    Vanilla pandas code:

    >>> import pandas
    >>> pandas.Series("ab").str.translate(str.maketrans({"a": "A", "b": "BBB"}))
    0    ABBB
    dtype: object

    Snowpark pandas equivalent:

    >>> pd.Series("ab").str.translate({"a": "A"}).str.replace("b", "BBB")
    0    ABBB
    dtype: object
    """

def isalnum():
    # Docstring carrier for Series.str.isalnum; no override docstring has been
    # written yet, so the stub body is a bare `pass`.
    pass
Expand Down
128 changes: 128 additions & 0 deletions tests/integ/modin/strings/test_translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#

import modin.pandas as pd
import pandas as native_pd
import pytest

import snowflake.snowpark.modin.plugin # noqa: F401
from tests.integ.modin.sql_counter import sql_count_checker
from tests.integ.modin.utils import (
assert_snowpark_pandas_equal_to_pandas,
create_test_series,
eval_snowpark_pandas_result,
)


@pytest.mark.parametrize(
    "data, table",
    [
        (
            # Simple 1-element mapping
            ["aaaaa", "bbbaaa", "cafdsaf;lh"],
            str.maketrans("a", "b"),
        ),
        (
            # Mapping with mixed str, unicode code points, and Nones
            ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"],
            str.maketrans(
                {ord("a"): "A", ord("f"): None, "y": "z", "k": None, ord("j"): ""}
            ),
        ),
        (
            # Mapping with special characters
            [
                "Peña",
                "Ordoñez",
                "Raúl",
                "Ibañez",
                "François",
                "øen",
                "2πr = τ",
                "München",
            ],
            str.maketrans(
                {
                    "ñ": "n",
                    "ú": "u",
                    "ç": "c",
                    "ø": "o",
                    "τ": "t",
                    "π": "p",
                    "ü": "u",
                }
            ),
        ),
        (
            # Mapping with compound emojis. Each item in the series renders as a single emoji,
            # but is actually 4 characters. Calling `len` on each element correctly returns 4.
            # https://unicode.org/emoji/charts/emoji-zwj-sequences.html
            # Inputs:
            # - "head shaking horizontally" = 1F642 + 200D + 2194 + FE0F
            # - "heart on fire" = 2764 + FE0F + 200D + 1F525
            # - "judge" = 1F9D1 + 200D + 2696 + FE0F
            # Outputs:
            # - "head shaking vertically" = 1F642 + 200D + 2195 + FE0F
            # - "mending heart" = 2764 + FE0F + 200D + 1FA79
            # - "health worker" = 1F9D1 + 200D + 2695 + FE0F
            ["🙂‍↔️", "❤️‍🔥", "🧑‍⚖️"],
            {
                0x2194: 0x2195,
                0x1F525: 0x1FA79,
                0x2696: 0x2695,
            },
        ),
    ],
)
@sql_count_checker(query_count=1)
def test_translate(data, table):
    # Compare Snowpark pandas and native pandas results on the same inputs.
    eval_snowpark_pandas_result(
        *create_test_series(data), lambda ser: ser.str.translate(table)
    )


@sql_count_checker(query_count=1)
def test_translate_without_maketrans():
    # pandas only understands tables whose keys are unicode ordinals, so raw
    # string keys must normally be converted through `ord` or `str.maketrans`.
    # Snowflake SQL maps strings directly, so Snowpark pandas accepts string
    # keys in addition to ordinals.
    data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"]
    table = {ord("a"): "A", ord("f"): None, "y": "z", "k": None}
    expected_native = native_pd.Series(data).str.translate(str.maketrans(table))
    assert_snowpark_pandas_equal_to_pandas(
        pd.Series(data).str.translate(table),
        expected_native,
    )
    # Without str.maketrans, native pandas silently ignores the "y" and "k"
    # entries because their keys are strings rather than unicode ordinals.
    raw_native_result = native_pd.Series(data).str.translate(table)
    assert not raw_native_result.equals(expected_native)


@pytest.mark.parametrize(
    "table, error",
    [
        ({"😶‍🌫️": "a"}, ValueError),  # This emoji key is secretly 4 code points
        ({"aa": "a"}, ValueError),  # Key is 2 chars
        # Mapping 1 char to multiple is valid in vanilla pandas, but we don't support this
        (
            {ord("a"): "😶‍🌫️"},
            NotImplementedError,
        ),  # This emoji value is secretly 4 code points
        ({ord("a"): "aa"}, NotImplementedError),  # Value is 2 chars
    ],
)
@sql_count_checker(query_count=0)
def test_translate_invalid_mappings(table, error):
    data = ["aaaaa", "fjkdsajk", "cjghgjqk", "yubikey"]
    # native pandas silently treats all of these cases as no-ops. However, since Snowflake SQL uses
    # strings as mappings instead of a dict construct, passing these arguments to the equivalent
    # SQL argument would either cause an inscrutable error or unexpected changes to the output series.
    snow_ser, native_ser = create_test_series(data)
    # Demonstrate that native pandas accepts the table without raising.
    native_ser.str.translate(table)
    with pytest.raises(error):
        snow_ser.str.translate(table)
1 change: 0 additions & 1 deletion tests/unit/modin/test_series_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def test_str_cat_no_others(mock_str_register, mock_series):
(lambda s: s.str.rindex("abc", start=1), "rindex"),
(lambda s: s.str.swapcase(), "swapcase"),
(lambda s: s.str.normalize("NFC"), "normalize"),
(lambda s: s.str.translate(str.maketrans("a", "b")), "translate"),
(lambda s: s.str.isalnum(), "isalnum"),
(lambda s: s.str.isalpha(), "isalpha"),
(lambda s: s.str.isnumeric(), "isnumeric"),
Expand Down

0 comments on commit a5c807b

Please sign in to comment.