SNOW-1620436 Improved to_datetime to handle all local input cases and improved time series notebook tests #2184

Merged · 5 commits · Aug 29, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -78,6 +78,7 @@
#### Improvements

- Refactored `quoted_identifier_to_snowflake_type` to avoid making metadata queries if the types have been cached locally.
- Improved `pd.to_datetime` to handle all local input cases.

#### Bug Fixes

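For context on the `pd.to_datetime` improvement noted above, a minimal sketch of what it means in practice (a hedged illustration, assuming an active Snowpark session; the import path follows this repo's layout at the time of the PR):

    import snowflake.snowpark.modin.pandas as pd

    # Scalar, None, and list-like inputs are now converted locally by native
    # pandas, so no Snowflake query is issued and pandas semantics apply.
    pd.to_datetime(None)                   # -> None, same as native pandas
    pd.to_datetime("2018-10-26 12:00:00")  # -> Timestamp('2018-10-26 12:00:00')

    # Snowpark pandas DataFrame/Series/Index inputs still convert lazily in Snowflake.
    snow_index = pd.to_datetime(pd.Index(["2018-10-26 12:00:00"]))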
55 changes: 29 additions & 26 deletions src/snowflake/snowpark/modin/pandas/general.py
@@ -1742,16 +1742,13 @@ def to_datetime(

The default behaviour (``utc=False``) is as follows:

- Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`:
- Timezone-naive inputs are kept as timezone-naive :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex`:

>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15'])
>>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)

- Timezone-aware inputs *with constant time offset* are still converted to
timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default.

>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'])
DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)

- Use the right format to convert to a timezone-aware type (note that when calling the Snowpark
pandas API to_pandas(), the timezone-aware output will always be converted to the session timezone):
@@ -1763,17 +1760,17 @@
issued from a timezone with daylight savings, such as Europe/Paris):

>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'])
DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None)
Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]')

>>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z")
DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
Index(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns]')

Setting ``utc=True`` ensures the output is always timezone-aware:

- Timezone-naive inputs are *localized* based on the session timezone

>>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)
DatetimeIndex(['2018-10-26 05:00:00-07:00', '2018-10-26 06:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None)

- Timezone-aware inputs are *converted* to session timezone

@@ -1784,8 +1781,28 @@
# TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py
raise_if_native_pandas_objects(arg)

    if arg is None:
        return None  # same as pandas
    if not isinstance(arg, (DataFrame, Series, pd.Index)):

[Author review note on the isinstance check: "match with what we do today for DatetimeIndex and timedelta."]

        # use pandas.to_datetime to convert local data to datetime
        res = pandas.to_datetime(
            arg,
            errors,
            dayfirst,
            yearfirst,
            utc,
            format,
            exact,
            unit,
            infer_datetime_format,
            origin,
            cache,
        )
        if isinstance(res, pandas.Series):
            res = pd.Series(res)
        elif not is_scalar(res):
            res = pd.Index(res)
        return res

    # handle modin objs
    if unit and unit not in VALID_TO_DATETIME_UNIT:
        raise ValueError(f"Unrecognized unit {unit}")

@@ -1795,15 +1812,8 @@
argument="cache",
message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied",
)
arg_is_scalar = is_scalar(arg)

if not isinstance(arg, (DataFrame, Series, pd.Index)):
# Turn dictionary like arg into pd.DataFrame and list-like or scalar to
# pd.Index.
arg = [arg] if arg_is_scalar else arg
arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg)

series_or_index = arg._to_datetime(
return arg._to_datetime(
errors=errors,
dayfirst=dayfirst,
yearfirst=yearfirst,
@@ -1814,13 +1824,6 @@
infer_datetime_format=infer_datetime_format,
origin=origin,
)
if arg_is_scalar:
# Calling squeeze directly on Snowpark pandas Series makes an unnecessary
# count sql call. To avoid that we convert Snowpark pandas Series to Native
# pandas series first.
# Note: When arg_is_scalar is True 'series_or_index' is always an Index.
return series_or_index.to_series().to_pandas().squeeze()
return series_or_index


@snowpark_pandas_telemetry_standalone_function_decorator
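Taken together, the new branch gives `pd.to_datetime` three outcomes; a hedged usage sketch (not from the PR itself, and exact reprs depend on the session):

    import snowflake.snowpark.modin.pandas as pd

    # Local inputs short-circuit to native pandas.to_datetime: scalars come back
    # as native results (e.g. Timestamp), pandas Series come back as Snowpark
    # pandas Series, and other list-likes come back as a Snowpark pandas Index.
    pd.to_datetime("2018-10-26")                  # Timestamp('2018-10-26 00:00:00')
    pd.to_datetime(["2018-10-26", "2018-10-27"])  # Snowpark pandas Index

    # Snowpark pandas inputs stay lazy; `unit` is validated eagerly.
    pd.to_datetime(pd.Index([1, 2]), unit="Y")    # raises ValueError: Unrecognized unit Y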
@@ -123,6 +123,11 @@
the specified time units.
"""

AUTO_FORMAT_WARNING_MSG = """Snowflake automatic format detection is used when a format is not provided.
In this case Snowflake's auto format may yield different result values compared to pandas.
See https://docs.snowflake.com/en/sql-reference/date-time-input-output#supported-formats-for-auto-detection for details
"""

# TODO: SNOW-1127160: support other units
VALID_TO_DATETIME_UNIT = ["D", "s", "ms", "us", "ns"]

@@ -304,9 +309,7 @@ def generate_timestamp_col(
if isinstance(datatype, (StringType, VariantType)):
WarningMessage.mismatch_with_pandas(
"to_datetime",
"Snowpark pandas to_datetime uses Snowflake's automatic format "
"detection to convert string to datetime when a format is not provided. "
"In this case Snowflake's auto format may yield different result values compared to pandas.",
AUTO_FORMAT_WARNING_MSG.replace("\n", " "),
)

from snowflake.snowpark.modin.plugin._internal.type_utils import (
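For reference, a sketch of when this warning path is hit (a hedged example, assuming a string-typed column and no explicit format):

    import snowflake.snowpark.modin.pandas as pd

    # A string-typed Snowpark pandas column converted without `format` relies on
    # Snowflake's automatic format detection, so the warning above is emitted
    # (once per process, via single_warning).
    ser = pd.Series(["2020-01-01", "2020-01-02"])
    converted = pd.to_datetime(ser)

    # An explicit format makes Snowflake parse with that format instead of
    # relying on auto-detection.
    converted_exact = pd.to_datetime(ser, format="%Y-%m-%d")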
4 changes: 2 additions & 2 deletions src/snowflake/snowpark/modin/plugin/docstrings/resample.py
@@ -200,7 +200,7 @@ def ffill():
2020-01-06 3
Freq: None, dtype: int64

>>> lst2 = pd.to_datetime(['2023-01-03 1:00:00', '2023-01-04', '2023-01-05 23:00:00', '2023-01-06', '2023-01-07 2:00:00', '2023-01-10'])
>>> lst2 = pd.to_datetime(pd.Index(['2023-01-03 1:00:00', '2023-01-04', '2023-01-05 23:00:00', '2023-01-06', '2023-01-07 2:00:00', '2023-01-10']))
>>> ser2 = pd.Series([1, 2, 3, 4, None, 6], index=lst2)
>>> ser2
2023-01-03 01:00:00 1.0
@@ -257,7 +257,7 @@ def ffill():
2020-01-03 0 15
2020-01-06 2 17

>>> index2 = pd.to_datetime(['2023-01-03 1:00:00', '2023-01-04', '2023-01-05 23:00:00', '2023-01-06', '2023-01-07 2:00:00', '2023-01-10'])
>>> index2 = pd.to_datetime(pd.Index(['2023-01-03 1:00:00', '2023-01-04', '2023-01-05 23:00:00', '2023-01-06', '2023-01-07 2:00:00', '2023-01-10']))
>>> df2 = pd.DataFrame({'a': range(len(index2)),
... 'b': range(len(index2) + 10, len(index2) * 2 + 10)},
... index=index2)
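These doctest updates follow from the new dispatch in general.py: a bare list now takes the local pandas path, while wrapping the strings in `pd.Index` keeps the conversion on the Snowflake engine, which is what the resample examples are meant to exercise. A hedged sketch of the distinction:

    import snowflake.snowpark.modin.pandas as pd

    dates = ['2023-01-03 1:00:00', '2023-01-04', '2023-01-05 23:00:00']

    # Converted locally by native pandas, then wrapped for Snowpark pandas.
    local_first = pd.to_datetime(dates)

    # Converted lazily by the Snowflake engine; the form the doctests now use.
    snow_first = pd.to_datetime(pd.Index(dates))
    ser = pd.Series([1, 2, 3], index=snow_first)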
@@ -51,7 +51,7 @@ def ignored_argument(cls, operation: str, argument: str, message: str) -> None:
@classmethod
def mismatch_with_pandas(cls, operation: str, message: str) -> None:
cls.single_warning(
f"`{operation}` implementation has mismatches with pandas:\n{message}."
f"`{operation}` implementation may have mismatches with pandas:\n{message}."
)

@classmethod
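The `single_warning` helper called here is assumed to deduplicate messages; a minimal sketch of that pattern (an assumption for illustration, not the repo's implementation):

    import logging

    class WarningMessageSketch:
        # Assumed dedup cache: each distinct message is emitted at most once.
        _printed: set = set()

        @classmethod
        def single_warning(cls, message: str) -> None:
            if message not in cls._printed:
                logging.getLogger(__name__).warning(message)
                cls._printed.add(message)

        @classmethod
        def mismatch_with_pandas(cls, operation: str, message: str) -> None:
            cls.single_warning(
                f"`{operation}` implementation may have mismatches with pandas:\n{message}."
            )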
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_astype.py
@@ -126,7 +126,7 @@ def test_astype_to_timedelta(dtype):
eval_snowpark_pandas_result(snow_df, native_df, lambda df: df.astype(dtype))


@sql_count_checker(query_count=2)
@sql_count_checker(query_count=0)
def test_astype_to_timedelta_negative():
native_datetime_df = native_pd.DataFrame(
data={"col1": [pd.to_datetime("2000-01-01"), pd.to_datetime("2001-01-01")]}
2 changes: 1 addition & 1 deletion tests/integ/modin/series/test_astype.py
@@ -418,7 +418,7 @@ def test_astype_to_timedelta(data):
)


@sql_count_checker(query_count=2)
@sql_count_checker(query_count=0)
def test_astype_to_timedelta_negative():
native_datetime_series = native_pd.Series(
data=[pd.to_datetime("2000-01-01"), pd.to_datetime("2001-01-01")]
45 changes: 25 additions & 20 deletions tests/integ/modin/tools/test_to_datetime.py
@@ -104,7 +104,7 @@ def test_to_datetime_format(self, cache, box, format, expected):
["1/3/2000", "20000103", "%m/%d/%Y"],
],
)
@sql_count_checker(query_count=1)
@sql_count_checker(query_count=0)
def test_to_datetime_format_scalar(self, cache, arg, expected, format):
result = to_datetime(arg, format=format, cache=cache)
expected = Timestamp(expected)
@@ -120,7 +120,7 @@
def test_to_datetime_format_unimplemented(self, cache, arg, format):
with pytest.raises(NotImplementedError):
assert to_datetime(
arg, format=format, cache=cache
pd.Index([arg]), format=format, cache=cache
) == native_pd.to_datetime(arg, format=format, cache=cache)

@pytest.mark.parametrize(
@@ -135,7 +135,7 @@ def test_to_datetime_format_not_match(self, cache, arg, format):
SnowparkSQLException,
match=f"Can't parse '{arg}' as timestamp with format 'DD/MM/YYYY'",
):
to_datetime(arg, format=format, cache=cache)
to_datetime(pd.Index([arg]), format=format, cache=cache).to_pandas()

@sql_count_checker(query_count=2, udf_count=0)
def test_to_datetime_format_YYYYMMDD(self, cache):
@@ -302,7 +302,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected):
@sql_count_checker(query_count=2)
def test_to_datetime_with_NA(self, data, format, expected):
# GH#42957
result = to_datetime(data, format=format)
result = to_datetime(pd.Index(data), format=format)
assert_index_equal(result, pd.DatetimeIndex(expected))

@sql_count_checker(query_count=1, udf_count=0)
@@ -328,7 +328,7 @@ def test_to_datetime_format_integer_year_month(self, cache):
result = to_datetime(ser, format="%Y%m", cache=cache)
assert_series_equal(result, expected, check_index_type=False)

@sql_count_checker(query_count=1)
@sql_count_checker(query_count=0)
def test_to_datetime_format_microsecond(self, cache):
month_abbr = calendar.month_abbr[4]
val = f"01-{month_abbr}-2011 00:00:01.978"
@@ -384,7 +384,9 @@ def test_to_datetime_format_microsecond(self, cache):
)
@sql_count_checker(query_count=1)
def test_to_datetime_format_time(self, cache, value, format, dt):
assert to_datetime(value, format=format, cache=cache) == dt
assert (
to_datetime(pd.Index([value]), format=format, cache=cache).to_pandas() == dt
)

@sql_count_checker(query_count=0)
def test_to_datetime_with_non_exact_unimplemented(self, cache):
@@ -407,9 +409,9 @@
"2012-01-01 09:00:00.001000000",
],
)
@sql_count_checker(query_count=2)
@sql_count_checker(query_count=1, join_count=1)
def test_parse_nanoseconds_with_formula(self, cache, arg):

arg = pd.Index([arg])
# GH8989
# truncating the nanoseconds when a format was provided
expected = to_datetime(arg, cache=cache)
@@ -426,7 +428,10 @@
@sql_count_checker(query_count=0)
def test_to_datetime_format_weeks(self, value, fmt, expected, cache):
with pytest.raises(NotImplementedError):
assert to_datetime(value, format=fmt, cache=cache) == expected
assert (
to_datetime(pd.Index([value]), format=fmt, cache=cache).to_pandas()[0]
== expected
)

@pytest.mark.parametrize(
"fmt,dates,expected_dates",
@@ -497,7 +502,7 @@ def test_to_datetime_parse_tzname_or_tzoffset_fallback(
):
# GH 13486
with pytest.raises(NotImplementedError):
to_datetime(dates, format=fmt).to_list()
to_datetime(pd.Index(dates), format=fmt).to_list()

@sql_count_checker(query_count=4)
def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self):
@@ -535,7 +540,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset):
SnowparkSQLException,
match="Can't parse|as timestamp with format 'YYYY-MM-DD HH24:MI:SS TZHTZM'",
):
to_datetime([date], format=fmt).to_pandas()
to_datetime(pd.Index([date]), format=fmt).to_pandas()

@sql_count_checker(query_count=0)
def test_to_datetime_parse_timezone_keeps_name(self):
@@ -551,15 +556,15 @@ class TestToDatetime:
def test_to_datetime_mixed_datetime_and_string(self):
d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1)))
d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1)))
res = to_datetime(["2020-01-01 17:00:00 -0100", d2])
res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]))
# The input becomes a series of variant type, whose timezone is not recognized by the Snowflake engine, so the
# result ignores the timezone by default
expected = native_pd.DatetimeIndex(
[datetime(2020, 1, 1, 17), datetime(2020, 1, 1, 18)]
)
assert_index_equal(res, expected)
# Set utc=True to make the to_datetime result timezone-aware
res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True)
res = to_datetime(pd.Index(["2020-01-01 17:00:00 -0100", d2]), utc=True)
expected = pd.DatetimeIndex([d1, d2])
assert_index_equal(res, expected)

@@ -584,15 +589,15 @@ def test_to_datetime_dtarr(self, tz):

@sql_count_checker(query_count=1)
def test_to_datetime_pydatetime(self):
actual = to_datetime(datetime(2008, 1, 15))
actual = to_datetime(pd.Index([datetime(2008, 1, 15)]))
assert actual == np.datetime64(datetime(2008, 1, 15))

@pytest.mark.parametrize(
"dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")]
)
@sql_count_checker(query_count=1)
@sql_count_checker(query_count=1, join_count=2)
def test_to_datetime_dt64s(self, cache, dt):
assert to_datetime(dt, cache=cache) == Timestamp(dt)
assert to_datetime(pd.Index([dt]), cache=cache)[0] == Timestamp(dt)

@pytest.mark.parametrize(
"sample",
@@ -831,11 +836,11 @@ def test_to_datetime_df_negative(self):
{"arg": 1490195805433502912, "unit": "ns"},
],
)
@sql_count_checker(query_count=1)
@sql_count_checker(query_count=1, join_count=2)
def test_to_datetime_unit(self, sample):
assert pd.to_datetime(
sample["arg"], unit=sample["unit"]
) == native_pd.to_datetime(sample["arg"], unit=sample["unit"])
assert pd.to_datetime(pd.Index([sample["arg"]]), unit=sample["unit"])[
0
] == native_pd.to_datetime(sample["arg"], unit=sample["unit"])

@sql_count_checker(query_count=0)
def test_to_datetime_unit_negative(self):
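The recurring edit in these tests is a single pattern: wrap a scalar in `pd.Index` to keep exercising the Snowflake conversion path (hence the updated query/join counts), or compare the now-locally-converted scalar directly against native pandas. A hedged distillation:

    import pandas as native_pd
    import snowflake.snowpark.modin.pandas as pd

    # Scalars convert locally, so the comparison needs no SQL at all.
    assert pd.to_datetime("20010101", format="%Y%m%d") == native_pd.to_datetime(
        "20010101", format="%Y%m%d"
    )

    # Wrapping in an Index forces the distributed path; to_pandas() materializes it.
    snow_result = pd.to_datetime(pd.Index(["20010101"]), format="%Y%m%d").to_pandas()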