SNOW-1620436 Improved to handle all local input cases and improved time series notebook tests
sfc-gh-azhan committed Aug 29, 2024
1 parent 98723ca commit 0c28a0c
Showing 8 changed files with 5,038 additions and 348 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -78,6 +78,7 @@
#### Improvements

- Refactored `quoted_identifier_to_snowflake_type` to avoid making metadata queries if the types have been cached locally.
+ - Improved `pd.to_datetime` to handle all local input cases.

#### Bug Fixes

55 changes: 29 additions & 26 deletions src/snowflake/snowpark/modin/pandas/general.py
@@ -1742,16 +1742,13 @@ def to_datetime(
The default behaviour (``utc=False``) is as follows:
- - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`:
+ - Timezone-naive inputs are kept as timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex`:
- >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
+ >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)
- Timezone-aware inputs *with constant time offset* are still converted to
timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default.
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
- DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
+ DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Use the right format to convert to a timezone-aware type (note that when calling the Snowpark
pandas API ``to_pandas()``, the timezone-aware output is always converted to the session timezone):
@@ -1763,17 +1760,17 @@
issued from a timezone with daylight savings, such as Europe/Paris):
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
- DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
+ Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]')
>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
- DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
+ Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]')
Setting ``utc=True`` ensures that the output is always timezone-aware:
- Timezone-naive inputs are *localized* based on the session timezone
>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
- DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
+ DatetimeIndex(['2018-10-26 05:00:00-07:00', '2018-10-26 06:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
- Timezone-aware inputs are *converted* to session timezone
@@ -1784,8 +1781,28 @@
    # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
    raise_if_native_pandas_objects(arg)

    if arg is None:
        return None  # same as pandas
+     if not isinstance(arg, (DataFrame, Series, pd.Index)):
+         # use pandas.to_datetime to convert local data to datetime
+         res = pandas.to_datetime(
+             arg,
+             errors,
+             dayfirst,
+             yearfirst,
+             utc,
+             format,
+             exact,
+             unit,
+             infer_datetime_format,
+             origin,
+             cache,
+         )
+         if isinstance(res, pandas.Series):
+             res = pd.Series(res)
+         elif not is_scalar(res):
+             res = pd.Index(res)
+         return res

    # handle modin objs
    if unit and unit not in VALID_TO_DATETIME_UNIT:
        raise ValueError(f"Unrecognized unit {unit}")

@@ -1795,15 +1812,8 @@
argument="cache",
message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
)
arg_is_scalar = is_scalar(arg)

if not isinstance(arg, (DataFrame, Series, pd.Index)):
# Turn dictionary like arg into pd.DataFrame and list-like or scalar to
# pd.Index.
arg = [arg] if arg_is_scalar else arg
arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)

series_or_index = arg._to_datetime(
return arg._to_datetime(
errors=errors,
dayfirst=dayfirst,
yearfirst=yearfirst,
@@ -1814,13 +1824,6 @@
        infer_datetime_format=infer_datetime_format,
        origin=origin,
    )
-     if arg_is_scalar:
-         # Calling squeeze directly on Snowpark pandas Series makes an unnecessary
-         # count sql call. To avoid that we convert Snowpark pandas Series to Native
-         # pandas series first.
-         # Note: When arg_is_scalar is True 'series_or_index' is always an Index.
-         return series_or_index.to_series().to_pandas().squeeze()
-     return series_or_index


@snowpark_pandas_telemetry_standalone_function_decorator
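
Taken together, these hunks replace the old approach, which wrapped scalars and list-likes into a Snowpark pandas Index before converting, with an early local path. A usage sketch, not part of the diff, assuming an active Snowpark pandas session and the import alias used in the tests below (`pd` is `snowflake.snowpark.modin.pandas`):

import snowflake.snowpark.modin.pandas as pd

# Scalar input: converted locally by native pandas; returns a Timestamp
# and issues no SQL.
pd.to_datetime("2018-10-26 12:00:00")

# List input: converted locally, then wrapped back into a Snowpark pandas
# Index (a Series result would be wrapped into pd.Series instead).
pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"])

# Snowpark pandas objects still take the Snowflake path via arg._to_datetime.
pd.to_datetime(pd.Index(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]))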
@@ -123,6 +123,11 @@
the specified time units.
"""

+ AUTO_FORMAT_WARNING_MSG = """Snowflake automatic format detection is used when a format is not provided.
+ In this case Snowflake's auto format may yield different result values compared to pandas.
+ See https://docs.snowflake.com/en/sql-reference/date-time-input-output#supported-formats-for-auto-detection for details.
+ """

# TODO: SNOW-1127160: support other units
VALID_TO_DATETIME_UNIT = ["D", "s", "ms", "us", "ns"]

@@ -304,9 +309,7 @@ def generate_timestamp_col(
    if isinstance(datatype, (StringType, VariantType)):
        WarningMessage.mismatch_with_pandas(
            "to_datetime",
-             "Snowpark pandas to_datetime uses Snowflake's automatic format "
-             "detection to convert string to datetime when a format is not provided. "
-             "In this case Snowflake's auto format may yield different result values compared to pandas.",
+             AUTO_FORMAT_WARNING_MSG.replace("\n", " "),
        )

from snowflake.snowpark.modin.plugin._internal.type_utils import (
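
For context, a sketch of when this warning fires, with hypothetical values: per the `isinstance` check above, it is emitted whenever a string or variant column is converted on the Snowflake path, and per the message text, auto-detection only applies when no `format` is given.

idx = pd.Index(["2020-10-25 02:00:00"])  # StringType column, Snowflake path

# No format: Snowflake auto-detection is used; results may differ from pandas.
pd.to_datetime(idx)

# Explicit format: no auto-detection, parsing follows the given format string.
pd.to_datetime(idx, format="%Y-%m-%d %H:%M:%S")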
@@ -51,7 +51,7 @@ def ignored_argument(cls, operation: str, argument: str, message: str) -> None:
    @classmethod
    def mismatch_with_pandas(cls, operation: str, message: str) -> None:
        cls.single_warning(
-             f"`{operation}` implementation has mismatches with pandas:\n{message}."
+             f"`{operation}` implementation may have mismatches with pandas:\n{message}."
        )

    @classmethod
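
A minimal sketch of how the renamed message constant and the softened wording compose, assuming the definitions above (deduplication via `single_warning` is presumed to warn once per message):

WarningMessage.mismatch_with_pandas(
    "to_datetime", AUTO_FORMAT_WARNING_MSG.replace("\n", " ")
)
# warns: "`to_datetime` implementation may have mismatches with pandas:
# Snowflake automatic format detection is used when a format is not provided. ..."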
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_astype.py
@@ -126,7 +126,7 @@ def test_astype_to_timedelta(dtype):
    eval_snowpark_pandas_result(snow_df, native_df, lambda df: df.astype(dtype))


- @sql_count_checker(query_count=2)
+ @sql_count_checker(query_count=0)
def test_astype_to_timedelta_negative():
    native_datetime_df = native_pd.DataFrame(
        data={"col1": [pd.to_datetime("2000-01-01"), pd.to_datetime("2001-01-01")]}
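
The drop from `query_count=2` to `0` follows directly from the `general.py` change above: the two scalar `pd.to_datetime(...)` calls in the test's setup previously went through Snowflake (presumably one query each) and are now evaluated locally, so building the frame issues no SQL.

# Both calls now return native pandas Timestamps without touching Snowflake.
data = {"col1": [pd.to_datetime("2000-01-01"), pd.to_datetime("2001-01-01")]}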
45 changes: 25 additions & 20 deletions tests/integ/modin/tools/test_to_datetime.py
@@ -104,7 +104,7 @@ def test_to_datetime_format(self, cache, box, format, expected):
["1/3/2000", "20000103", "%m/%d/%Y"],
],
)
@sql_count_checker(query_count=1)
@sql_count_checker(query_count=0)
def test_to_datetime_format_scalar(self, cache, arg, expected, format):
result = to_datetime(arg, format=format, cache=cache)
expected = Timestamp(expected)
@@ -120,7 +120,7 @@ def test_to_datetime_format_scalar(self, cache, arg, expected, format):
    def test_to_datetime_format_unimplemented(self, cache, arg, format):
        with pytest.raises(NotImplementedError):
            assert to_datetime(
-                 arg, format=format, cache=cache
+                 pd.Index([arg]), format=format, cache=cache
            ) == native_pd.to_datetime(arg, format=format, cache=cache)

    @pytest.mark.parametrize(
@@ -135,7 +135,7 @@ def test_to_datetime_format_not_match(self, cache, arg, format):
            SnowparkSQLException,
            match=f"Can't parse '{arg}' as timestamp with format 'DD/MM/YYYY'",
        ):
-             to_datetime(arg, format=format, cache=cache)
+             to_datetime(pd.Index([arg]), format=format, cache=cache).to_pandas()

    @sql_count_checker(query_count=2, udf_count=0)
    def test_to_datetime_format_YYYYMMDD(self, cache):
@@ -302,7 +309,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected):
    @sql_count_checker(query_count=2)
    def test_to_datetime_with_NA(self, data, format, expected):
        # GH#42957
-         result = to_datetime(data, format=format)
+         result = to_datetime(pd.Index(data), format=format)
        assert_index_equal(result, pd.DatetimeIndex(expected))

    @sql_count_checker(query_count=1, udf_count=0)
@@ -328,7 +328,7 @@ def test_to_datetime_format_integer_year_month(self, cache):
        result = to_datetime(ser, format="%Y%m", cache=cache)
        assert_series_equal(result, expected, check_index_type=False)

-     @sql_count_checker(query_count=1)
+     @sql_count_checker(query_count=0)
    def test_to_datetime_format_microsecond(self, cache):
        month_abbr = calendar.month_abbr[4]
        val = f"01-{month_abbr}-2011 00:00:01.978"
@@ -384,7 +384,9 @@ def test_to_datetime_format_microsecond(self, cache):
    )
    @sql_count_checker(query_count=1)
    def test_to_datetime_format_time(self, cache, value, format, dt):
-         assert to_datetime(value, format=format, cache=cache) == dt
+         assert (
+             to_datetime(pd.Index([value]), format=format, cache=cache).to_pandas() == dt
+         )

    @sql_count_checker(query_count=0)
    def test_to_datetime_with_non_exact_unimplemented(self, cache):
@@ -407,9 +409,9 @@ def test_to_datetime_with_non_exact_unimplemented(self, cache):
"2012-01-01 09:00:00.001000000",
],
)
@sql_count_checker(query_count=2)
@sql_count_checker(query_count=1, join_count=1)
def test_parse_nanoseconds_with_formula(self, cache, arg):

arg = pd.Index([arg])
# GH8989
# truncating the nanoseconds when a format was provided
expected = to_datetime(arg, cache=cache)
@@ -426,7 +428,10 @@ def test_parse_nanoseconds_with_formula(self, cache, arg):
    @sql_count_checker(query_count=0)
    def test_to_datetime_format_weeks(self, value, fmt, expected, cache):
        with pytest.raises(NotImplementedError):
-             assert to_datetime(value, format=fmt, cache=cache) == expected
+             assert (
+                 to_datetime(pd.Index([value]), format=fmt, cache=cache).to_pandas()[0]
+                 == expected
+             )

    @pytest.mark.parametrize(
        "fmt,dates,expected_dates",
@@ -497,7 +502,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_fallback(
    ):
        # GH 13486
        with pytest.raises(NotImplementedError):
-             to_datetime(dates, format=fmt).to_list()
+             to_datetime(pd.Index(dates), format=fmt).to_list()

    @sql_count_checker(query_count=4)
    def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self):
@@ -535,7 +540,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset):
            SnowparkSQLException,
            match="Can't parse|as timestamp with format 'YYYY-MM-DD HH24:MI:SS TZHTZM'",
        ):
-             to_datetime([date], format=fmt).to_pandas()
+             to_datetime(pd.Index([date]), format=fmt).to_pandas()

    @sql_count_checker(query_count=0)
    def test_to_datetime_parse_timezone_keeps_name(self):
@@ -551,15 +556,15 @@ class TestToDatetime:
    def test_to_datetime_mixed_datetime_and_string(self):
        d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1)))
        d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1)))
-         res = to_datetime(["2020-01-01 17:00:00 -0100", d2])
+         res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]))
        # The input becomes a series of variant type, and the timezone is not
        # recognized by the Snowflake engine, so the result ignores the timezone by default.
        expected = native_pd.DatetimeIndex(
            [datetime(2020, 1, 1, 17), datetime(2020, 1, 1, 18)]
        )
        assert_index_equal(res, expected)
        # Set utc=True to make sure the result is timezone-aware.
-         res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True)
+         res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]), utc=True)
        expected = pd.DatetimeIndex([d1, d2])
        assert_index_equal(res, expected)

@@ -584,15 +589,15 @@ def test_to_datetime_dtarr(self, tz):

    @sql_count_checker(query_count=1)
    def test_to_datetime_pydatetime(self):
-         actual = to_datetime(datetime(2008, 1, 15))
+         actual = to_datetime(pd.Index([datetime(2008, 1, 15)]))
        assert actual == np.datetime64(datetime(2008, 1, 15))

    @pytest.mark.parametrize(
        "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")]
    )
-     @sql_count_checker(query_count=1)
+     @sql_count_checker(query_count=1, join_count=2)
    def test_to_datetime_dt64s(self, cache, dt):
-         assert to_datetime(dt, cache=cache) == Timestamp(dt)
+         assert to_datetime(pd.Index([dt]), cache=cache)[0] == Timestamp(dt)

    @pytest.mark.parametrize(
        "sample",
@@ -831,11 +836,11 @@ def test_to_datetime_df_negative(self):
{"arg": 1490195805433502912, "unit": "ns"},
],
)
@sql_count_checker(query_count=1)
@sql_count_checker(query_count=1, join_count=2)
def test_to_datetime_unit(self, sample):
assert pd.to_datetime(
sample["arg"], unit=sample["unit"]
) == native_pd.to_datetime(sample["arg"], unit=sample["unit"])
assert pd.to_datetime(pd.Index([sample["arg"]]), unit=sample["unit"])[
0
] == native_pd.to_datetime(sample["arg"], unit=sample["unit"])

    @sql_count_checker(query_count=0)
    def test_to_datetime_unit_negative(self):
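
The pattern running through these test updates: wrapping the input in `pd.Index([...])` forces the Snowflake conversion path, with its counted queries and joins, while bare scalars and lists now take the local pandas path. A sketch of both sides, with hypothetical test names and the decorators used in this file:

@sql_count_checker(query_count=0)
def test_scalar_stays_local():
    # Local path: native pandas converts the scalar; no SQL is issued.
    assert pd.to_datetime("2000-01-01") == native_pd.Timestamp("2000-01-01")


@sql_count_checker(query_count=1, join_count=2)
def test_index_goes_through_snowflake():
    # Snowflake path: the Index is converted by the engine; materializing
    # element 0 issues the counted query.
    assert (
        pd.to_datetime(pd.Index(["2000-01-01"]))[0]
        == native_pd.Timestamp("2000-01-01")
    )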