From 386e7d5bdfc01be389e4f7d1370b3d980e4a7454 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Wed, 7 Aug 2024 10:55:05 -0700 Subject: [PATCH 1/4] SNOW-1486910: Return DatetimeIndex from to_datetime and date_range --- CHANGELOG.md | 2 + .../snowpark/modin/pandas/general.py | 167 ++++++------------ .../compiler/snowflake_query_compiler.py | 19 +- .../snowpark/modin/plugin/extensions/index.py | 64 ++++++- tests/integ/modin/tools/test_date_range.py | 8 +- tests/integ/modin/tools/test_to_datetime.py | 50 +++--- tests/integ/modin/utils.py | 3 +- 7 files changed, 166 insertions(+), 147 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc28d76f3b8..4339973da47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -79,6 +79,8 @@ ### Behavior change - `Dataframe.columns` now returns native pandas Index object instead of Snowpark Index object. - Refactor and introduce `query_compiler` argument in `Index` constructor to create `Index` from query compiler. +- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object. +- `pd.date_range` now returns a DatetimeIndex object instead of a Series object. ## 1.20.0 (2024-07-17) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 468b31f5367..417d1edca5f 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1572,10 +1572,7 @@ def to_datetime( >>> pd.to_datetime([1, 2, 3], unit='D', ... origin=pd.Timestamp('1960-01-01')) - 0 1960-01-02 - 1 1960-01-03 - 2 1960-01-04 - dtype: datetime64[ns] + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) **Non-convertible date/times** @@ -1589,9 +1586,7 @@ def to_datetime( in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. >>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce') - 0 NaT - 1 NaT - dtype: datetime64[ns] + DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) .. _to_datetime_tz_examples: @@ -1603,55 +1598,41 @@ def to_datetime( - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`: >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) - 0 2018-10-26 12:00:00 - 1 2018-10-26 13:00:15 - dtype: datetime64[ns] + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - Timezone-aware inputs *with constant time offset* are still converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default. 
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500']) - 0 2018-10-26 12:00:00 - 1 2018-10-26 13:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None) - Use right format to convert to timezone-aware type (Note that when call Snowpark pandas API to_pandas() the timezone-aware output will always be converted to session timezone): >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z") - 0 2018-10-26 10:00:00-07:00 - 1 2018-10-26 11:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) - Timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris): >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100']) - 0 2020-10-25 02:00:00 - 1 2020-10-25 04:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None) >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z") - 0 2020-10-24 17:00:00-07:00 - 1 2020-10-24 20:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) Setting ``utc=True`` makes sure always convert to timezone-aware outputs: - Timezone-naive inputs are *localized* based on the session timezone >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) - 0 2018-10-26 12:00:00-07:00 - 1 2018-10-26 13:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) - Timezone-aware inputs are *converted* to session timezone >>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'], ... utc=True) - 0 2018-10-26 10:30:00-07:00 - 1 2018-10-26 10:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) """ # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py raise_if_native_pandas_objects(arg) @@ -1668,22 +1649,14 @@ def to_datetime( message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied", ) arg_is_scalar = is_scalar(arg) - # handle empty array, list, dict - if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0: - return arg if isinstance(arg, Series) else Series(arg) # always return a Series - if not isinstance(arg, (DataFrame, Series)): - # turn dictionary like arg into DataFrame and list like or scalar to Series - if isinstance(arg, dict): - arg = DataFrame(arg) # pragma: no cover - else: - name = None - # keep index name - if isinstance(arg, pd.Index): - name = arg.name - arg = Series(arg) - arg.name = name - - series = arg._to_datetime( + + if not isinstance(arg, (DataFrame, Series, pd.Index)): + # Turn dictionary like arg into pd.DataFrame and list-like or scalar to + # pd.Index. 
+ arg = [arg] if arg_is_scalar else arg + arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg) + + series_or_index = arg._to_datetime( errors=errors, dayfirst=dayfirst, yearfirst=yearfirst, @@ -1697,9 +1670,10 @@ def to_datetime( if arg_is_scalar: # Calling squeeze directly on Snowpark pandas Series makes an unnecessary # count sql call. To avoid that we convert Snowpark pandas Series to Native - # pandas seris first. - return series.to_pandas().squeeze() - return series + # pandas series first. + # Note: When arg_is_scalar is True 'series_or_index' is always an Index. + return series_or_index.to_series().to_pandas().squeeze() + return series_or_index @snowpark_pandas_telemetry_standalone_function_decorator @@ -2004,9 +1978,9 @@ def date_range( name: Hashable | None = None, inclusive: IntervalClosedType = "both", **kwargs, -) -> Series: +) -> pd.DatetimeIndex: """ - Return a fixed frequency series. + Return a fixed frequency DatetimeIndex. Returns the range of equally spaced time points (where the difference between any two adjacent points is specified by the given frequency) such that they all @@ -2078,109 +2052,72 @@ def date_range( Specify `start` and `end`, with the default daily frequency. >>> pd.date_range(start='1/1/2018', end='1/08/2018') - 0 2018-01-01 - 1 2018-01-02 - 2 2018-01-03 - 3 2018-01-04 - 4 2018-01-05 - 5 2018-01-06 - 6 2018-01-07 - 7 2018-01-08 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq=None) Specify `start` and `periods`, the number of periods (days). >>> pd.date_range(start='1/1/2018', periods=8) - 0 2018-01-01 - 1 2018-01-02 - 2 2018-01-03 - 3 2018-01-04 - 4 2018-01-05 - 5 2018-01-06 - 6 2018-01-07 - 7 2018-01-08 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq=None) Specify `end` and `periods`, the number of periods (days). >>> pd.date_range(end='1/1/2018', periods=8) - 0 2017-12-25 - 1 2017-12-26 - 2 2017-12-27 - 3 2017-12-28 - 4 2017-12-29 - 5 2017-12-30 - 6 2017-12-31 - 7 2018-01-01 - dtype: datetime64[ns] + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq=None) Specify `start`, `end`, and `periods`; the frequency is generated automatically (linearly spaced). >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) - 0 2018-04-24 00:00:00 - 1 2018-04-25 12:00:00 - 2 2018-04-27 00:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + **Other Parameters** Changed the `freq` (frequency) to ``'ME'`` (month end frequency). >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') - 0 2018-01-31 - 1 2018-02-28 - 2 2018-03-31 - 3 2018-04-30 - 4 2018-05-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq=None) Multiples are allowed >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') - 0 2018-01-31 - 1 2018-04-30 - 2 2018-07-31 - 3 2018-10-31 - 4 2019-01-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq=None) `freq` can also be specified as an Offset object. 
>>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) - 0 2018-01-31 - 1 2018-04-30 - 2 2018-07-31 - 3 2018-10-31 - 4 2019-01-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq=None) `inclusive` controls whether to include `start` and `end` that are on the boundary. The default, "both", includes boundary points on either end. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both") - 0 2017-01-01 - 1 2017-01-02 - 2 2017-01-03 - 3 2017-01-04 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None) Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left') - 0 2017-01-01 - 1 2017-01-02 - 2 2017-01-03 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None) Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and similarly ``inclusive='neither'`` will exclude both `start` and `end`. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right') - 0 2017-01-02 - 1 2017-01-03 - 2 2017-01-04 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None) """ # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py @@ -2229,9 +2166,11 @@ def date_range( left_inclusive=left_inclusive, right_inclusive=right_inclusive, ) - s = Series(query_compiler=qc) - s.name = name - return s + # Set date range as index column. + qc = qc.set_index_from_columns(qc.columns.tolist()) + # Set index column name. + qc = qc.set_index_names([name]) + return pd.DatetimeIndex(data=qc) @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 22ac143dfe2..17fcf35f6d7 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5551,7 +5551,7 @@ def set_index_from_columns( for ( ids ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - keys + keys, include_index=False ): # Error checking for missing labels is already done in frontend layer. index_column_snowflake_quoted_identifiers.append(ids[0]) @@ -5870,6 +5870,7 @@ def series_to_datetime( unit: Optional[str] = None, infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default, origin: DateTimeOrigin = "unix", + include_index: bool = False, ) -> "SnowflakeQueryCompiler": """ Convert series to the datetime dtype. @@ -5884,6 +5885,7 @@ def series_to_datetime( unit: to_datetime unit infer_datetime_format: to_datetime infer_datetime_format origin: to_datetime origin + include_index: If True, also convert index columns to datetime. Returns: SnowflakeQueryCompiler: QueryCompiler with a single data column converted to datetime dtype. 
@@ -5896,12 +5898,16 @@ def series_to_datetime( to_snowflake_timestamp_format(format) if format is not None else None ) id_to_sf_type_map = self._modin_frame.quoted_identifier_to_snowflake_type() - col_id = self._modin_frame.data_column_snowflake_quoted_identifiers[0] - sf_type = id_to_sf_type_map[col_id] + col_ids = [] + if include_index: + col_ids = self._modin_frame.index_column_snowflake_quoted_identifiers + col_ids.extend(self._modin_frame.data_column_snowflake_quoted_identifiers) - if isinstance(sf_type, BooleanType): - # bool is not allowed in to_datetime (but note that bool is allowed by astype) - raise TypeError("dtype bool cannot be converted to datetime64[ns]") + for col_id in col_ids: + sf_type = id_to_sf_type_map[col_id] + if isinstance(sf_type, BooleanType): + # bool is not allowed in to_datetime (but note that bool is allowed by astype) + raise TypeError("dtype bool cannot be converted to datetime64[ns]") to_datetime_cols = { col_id: generate_timestamp_col( @@ -5913,6 +5919,7 @@ def series_to_datetime( unit="ns" if unit is None else unit, origin=origin, ) + for col_id in col_ids } return SnowflakeQueryCompiler( self._modin_frame.update_snowflake_quoted_identifiers_with_expressions( diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index ef94d7bb7ea..e11ac325f0d 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -30,7 +30,7 @@ import numpy as np import pandas as native_pd from pandas._libs import lib -from pandas._typing import ArrayLike, DtypeObj, NaPosition +from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_datetime64_any_dtype, pandas_dtype @@ -39,6 +39,7 @@ from snowflake.snowpark.modin.pandas.base import BasePandasDataset from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta +from snowflake.snowpark.modin.plugin._internal.timestamp_utils import DateTimeOrigin from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -2368,3 +2369,64 @@ def str(self) -> native_pd.core.strings.accessor.StringMethods: """ WarningMessage.index_to_pandas_warning("str") return self.to_pandas().str + + def _to_datetime( + self, + errors: DateTimeErrorChoices = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = False, + format: str = None, + exact: bool | lib.NoDefault = lib.no_default, + unit: str = None, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, + origin: DateTimeOrigin = "unix", + ) -> Index: + """ + Args: + errors: {'ignore', 'raise', 'coerce'}, default 'raise' + If 'raise', then invalid parsing will raise an exception. + If 'coerce', then invalid parsing will be set as NaT. + If 'ignore', then invalid parsing will return the input. + dayfirst: bool, default False + Specify a date parse order if arg is str or is list-like. + yearfirst: bool, default False + Specify a date parse order if arg is str or is list-like. + utc: bool, default False + Control timezone-related parsing, localization and conversion. + format: str, default None + The strftime to parse time + exact: bool, default True + Control how format is used: + True: require an exact format match. 
+ False: allow the format to match anywhere in the target string. + unit: str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer + or float number. + infer_datetime_format: bool, default False + If True and no format is given, attempt to infer the format of the \ + datetime strings based on the first non-NaN element. + origin: scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by unit) since this reference date. + + Returns: + DatetimeIndex + """ + from snowflake.snowpark.modin.plugin.extensions.datetime_index import ( + DatetimeIndex, + ) + + new_qc = self._query_compiler.series_to_datetime( + errors, + dayfirst, + yearfirst, + utc, + format, + exact, + unit, + infer_datetime_format, + origin, + include_index=True, + ) + return DatetimeIndex(data=new_qc) diff --git a/tests/integ/modin/tools/test_date_range.py b/tests/integ/modin/tools/test_date_range.py index a1f3c4d0ba2..d26861442f4 100644 --- a/tests/integ/modin/tools/test_date_range.py +++ b/tests/integ/modin/tools/test_date_range.py @@ -63,7 +63,7 @@ @sql_count_checker(query_count=1) def test_regular_range(kwargs): assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -125,7 +125,7 @@ def test_regular_range(kwargs): @sql_count_checker(query_count=1) def test_irregular_range(kwargs): assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -168,7 +168,7 @@ def test_without_freq(periods, inclusive): "inclusive": inclusive, } assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -193,7 +193,7 @@ def test_without_freq(periods, inclusive): def test_inclusive(kwargs, inclusive): kwargs.update({"inclusive": inclusive}) assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index d08495b31e9..07fb4aefebf 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -26,6 +26,7 @@ ) from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( + assert_index_equal, assert_series_equal, assert_snowpark_pandas_equal_to_pandas, eval_snowpark_pandas_result, @@ -58,10 +59,10 @@ def test_to_datetime_readonly(self, readonly): if readonly: arr.setflags(write=False) result = to_datetime(arr) - expected = Series([], dtype=object) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex([]) + assert_index_equal(result, expected) - @pytest.mark.parametrize("box", [Series, native_pd.Index]) + @pytest.mark.parametrize("box", [Series, pd.Index]) @pytest.mark.parametrize( "format, expected", [ @@ -79,11 +80,18 @@ def test_to_datetime_readonly(self, readonly): def test_to_datetime_format(self, cache, box, format, expected): values = box(["1/1/2000", "1/2/2000", "1/3/2000"]) result = to_datetime(values, format=format, cache=cache) - expected = Series(expected) - assert_series_equal(result, expected) + expected = box(expected) + if box is Series: + assert_series_equal(result, 
expected) + else: + assert_index_equal(result, expected) + # cache values is ignored at Snowpark pandas so only test here to make sure it works as well result = to_datetime(values, format=format, cache=False) - assert_series_equal(result, expected) + if box is Series: + assert_series_equal(result, expected) + else: + assert_index_equal(result, expected) @pytest.mark.parametrize( "arg, expected, format", @@ -235,9 +243,9 @@ def test_to_datetime_format_YYYYMMDD_with_none(self, input_s): # GH 30011 # format='yyyymmdd' # with None - expected = Series([Timestamp("19801222"), Timestamp("20010112"), NaT]) - result = Series(to_datetime(input_s, format="%Y%m%d")) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex([Timestamp("19801222"), Timestamp("20010112"), NaT]) + result = to_datetime(input_s, format="%Y%m%d") + assert_index_equal(result, expected) @pytest.mark.parametrize( "input, expected", @@ -295,7 +303,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected): def test_to_datetime_with_NA(self, data, format, expected): # GH#42957 result = to_datetime(data, format=format) - assert_series_equal(result, Series(expected)) + assert_index_equal(result, pd.DatetimeIndex(expected)) @sql_count_checker(query_count=1, udf_count=0) def test_to_datetime_format_integer_year_only(self, cache): @@ -509,10 +517,10 @@ def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): fmt = "%Y-%m-%d %H:%M:%S %z" result = to_datetime(dates, format=fmt, utc=True) - expected = Series(expected_dates) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex(expected_dates) + assert_index_equal(result, expected) result2 = to_datetime(dates, utc=True) - assert_series_equal(result2, expected) + assert_index_equal(result2, expected) @pytest.mark.parametrize( "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] @@ -529,8 +537,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset): ): to_datetime([date], format=fmt).to_pandas() - # 2 extra queries to convert index to series - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=0) def test_to_datetime_parse_timezone_keeps_name(self): # GH 21697 fmt = "%Y-%m-%d %H:%M:%S %z" @@ -547,14 +554,14 @@ def test_to_datetime_mixed_datetime_and_string(self): res = to_datetime(["2020-01-01 17:00:00 -0100", d2]) # The input will become a series with variant type and the timezone is unaware by the Snowflake engine, so the # result ignores the timezone by default - expected = native_pd.Series( + expected = native_pd.DatetimeIndex( [datetime(2020, 1, 1, 17), datetime(2020, 1, 1, 18)] ) - assert_series_equal(res, expected, check_dtype=False, check_index_type=False) + assert_index_equal(res, expected) # Set utc=True to make sure timezone aware in to_datetime res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True) - expected = pd.Series([d1, d2]) - assert_series_equal(res, expected, check_dtype=False, check_index_type=False) + expected = pd.DatetimeIndex([d1, d2]) + assert_index_equal(res, expected) @pytest.mark.parametrize( "tz", @@ -563,13 +570,14 @@ def test_to_datetime_mixed_datetime_and_string(self): pytest.param("US/Central"), ], ) - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=3) def test_to_datetime_dtarr(self, tz): # DatetimeArray dti = native_pd.date_range("1965-04-03", periods=19, freq="2W", tz=tz) arr = DatetimeArray(dti) + # Use assert_series_equal to ignore timezone difference in dtype. 
assert_series_equal( - to_datetime(arr), + Series(to_datetime(arr)), Series(arr), check_dtype=False, ) diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py index 6ec630024a3..06728f9e985 100644 --- a/tests/integ/modin/utils.py +++ b/tests/integ/modin/utils.py @@ -271,7 +271,8 @@ def assert_snowpark_pandas_equal_to_pandas( tm.assert_series_equal(snow_to_native, expected_pandas, **kwargs) else: assert isinstance(snow, Index) - kwargs.pop("check_dtype") + if "check_dtype" in kwargs: + kwargs.pop("check_dtype") if kwargs.pop("check_index_type"): kwargs.update(exact=False) tm.assert_index_equal(snow_to_native, expected_pandas, **kwargs) From 2b5a7725b3b1ad63a62222ea122aab74eaa32a3b Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Thu, 8 Aug 2024 21:45:42 -0700 Subject: [PATCH 2/4] Fix tests and update doctests --- .../snowpark/modin/pandas/general.py | 2 +- src/snowflake/snowpark/modin/pandas/series.py | 5 ++- .../compiler/snowflake_query_compiler.py | 5 ++- .../modin/plugin/docstrings/series_utils.py | 32 +++++++------------ tests/integ/modin/frame/test_duplicated.py | 2 +- tests/integ/modin/test_telemetry.py | 2 +- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 417d1edca5f..4161d316b0e 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2167,7 +2167,7 @@ def date_range( right_inclusive=right_inclusive, ) # Set date range as index column. - qc = qc.set_index_from_columns(qc.columns.tolist()) + qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False) # Set index column name. qc = qc.set_index_names([name]) return pd.DatetimeIndex(data=qc) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 12dc9d10972..a494b513de5 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -130,7 +130,10 @@ def __init__( # modified: # Engine.subscribe(_update_engine) - if isinstance(data, type(self)): + # Convert lazy index to Series without pulling the data to client. + if isinstance(data, pd.Index): + query_compiler = data.to_series(index=index, name=name)._query_compiler + elif isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: if any(i not in data.index for i in index): diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 17fcf35f6d7..faa087464e5 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5530,6 +5530,7 @@ def set_index_from_columns( keys: list[Hashable], drop: Optional[bool] = True, append: Optional[bool] = False, + include_index: Optional[bool] = True, ) -> "SnowflakeQueryCompiler": """ Create or update index (row labels) from a list of columns. @@ -5542,6 +5543,8 @@ def set_index_from_columns( append: bool, default False Whether to add the columns in `keys` as new levels appended to the existing index. + include_index: bool, default True + Whether the keys can also include index column lables as well. Returns: A new QueryCompiler instance with updated index. 
@@ -5551,7 +5554,7 @@ def set_index_from_columns( for ( ids ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - keys, include_index=False + keys, include_index=include_index ): # Error checking for missing labels is already done in frontend layer. index_column_snowflake_quoted_identifiers.append(ids[0]) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py index ba729cce6a9..c2519fc76d6 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py @@ -1353,7 +1353,7 @@ def dayofweek(): Examples -------- - >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D') + >>> s = pd.Series(pd.date_range('2016-12-31', '2017-01-08', freq='D')) >>> s 0 2016-12-31 1 2017-01-01 @@ -1390,7 +1390,7 @@ def dayofyear(): Examples -------- - >>> s = pd.to_datetime(["1/1/2020", "2/1/2020"]) + >>> s = pd.Series(pd.to_datetime(["1/1/2020", "2/1/2020"])) >>> s 0 2020-01-01 1 2020-02-01 @@ -1670,9 +1670,8 @@ def is_leap_year(): This method is available on Series with datetime values under the .dt accessor, and directly on DatetimeIndex. >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") - >>> idx # doctest: +SKIP - DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], - dtype='datetime64[ns]', freq='YE-DEC') + >>> idx + DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], dtype='datetime64[ns]', freq=None) >>> idx.is_leap_year # doctest: +SKIP array([ True, False, False]) @@ -1688,7 +1687,6 @@ def is_leap_year(): 2 False dtype: bool """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. @property def daysinmonth(): @@ -1762,22 +1760,19 @@ def month_name(): dtype: object >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='ME') + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) >>> idx.month_name() # doctest: +SKIP Index(['January', 'February', 'March'], dtype='object') Using the locale parameter you can set a different locale language, for example: idx.month_name(locale='pt_BR.utf8') will return month names in Brazilian Portuguese language. >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='ME') + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. def day_name(): """ @@ -1808,22 +1803,19 @@ def day_name(): dtype: object >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) >>> idx.day_name() # doctest: +SKIP Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') Using the locale parameter you can set a different locale language, for example: idx.day_name(locale='pt_BR.utf8') will return day names in Brazilian Portuguese language. 
>>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Segunda', 'Terça', 'Quarta'], dtype='object') """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. def total_seconds(): pass diff --git a/tests/integ/modin/frame/test_duplicated.py b/tests/integ/modin/frame/test_duplicated.py index d0031adf0c4..e4c5d594ecc 100644 --- a/tests/integ/modin/frame/test_duplicated.py +++ b/tests/integ/modin/frame/test_duplicated.py @@ -93,7 +93,7 @@ def test_duplicated_on_empty_frame(): @sql_count_checker(query_count=3, join_count=2) def test_frame_datetime64_duplicated(): - dates = pd.date_range("2010-07-01", end="2010-08-05") + dates = pd.date_range("2010-07-01", end="2010-08-05").to_series() tst = pd.DataFrame({"symbol": "AAA", "date": dates}) result = tst.duplicated(["date", "symbol"]) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index c6c17313489..c908b56c56a 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -325,7 +325,7 @@ def sample_function( ) @sql_count_checker(query_count=7, fallback_count=1, sproc_count=1) def test_property_methods_telemetry(): - datetime_series = pd.date_range("2000-01-01", periods=3, freq="h") + datetime_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h")) ret_series = datetime_series.dt.timetz assert len(ret_series._query_compiler.snowpark_pandas_api_calls) == 1 api_call = ret_series._query_compiler.snowpark_pandas_api_calls[0] From 159c81bdcc19c1883c878c38c76dd44b9e71d421 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Fri, 9 Aug 2024 12:20:42 -0700 Subject: [PATCH 3/4] Fix query counts and ctor updates --- .../snowpark/modin/pandas/general.py | 7 ++-- src/snowflake/snowpark/modin/pandas/series.py | 1 + .../snowpark/modin/plugin/extensions/index.py | 2 +- tests/integ/modin/frame/test_loc.py | 39 +++++-------------- tests/integ/modin/frame/test_set_index.py | 6 +-- tests/integ/modin/series/test_loc.py | 24 +++++------- tests/integ/modin/test_concat.py | 4 +- tests/integ/modin/tools/test_to_datetime.py | 2 +- 8 files changed, 28 insertions(+), 57 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 4161d316b0e..af0369771bf 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1352,7 +1352,7 @@ def to_datetime( infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: Any = "unix", cache: bool = True, -) -> Series | DatetimeScalar | NaTType | None: +) -> pd.DatetimeIndex | Series | DatetimeScalar | NaTType | None: """ Convert argument to datetime. 
@@ -1459,8 +1459,7 @@ def to_datetime( parsing): - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) - - array-like: :class:`~snowflake.snowpark.modin.pandas.Series` with :class:`datetime64` dtype containing - :class:`datetime.datetime` (or + - array-like: :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex` (or :class: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`object` dtype containing :class:`datetime.datetime`) - Series: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`datetime64` dtype (or @@ -2170,7 +2169,7 @@ def date_range( qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False) # Set index column name. qc = qc.set_index_names([name]) - return pd.DatetimeIndex(data=qc) + return pd.DatetimeIndex(query_compiler=qc) @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index a494b513de5..f268a21306b 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -133,6 +133,7 @@ def __init__( # Convert lazy index to Series without pulling the data to client. if isinstance(data, pd.Index): query_compiler = data.to_series(index=index, name=name)._query_compiler + query_compiler = query_compiler.reset_index(drop=True) elif isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index e11ac325f0d..a3b4265708a 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2429,4 +2429,4 @@ def _to_datetime( origin, include_index=True, ) - return DatetimeIndex(data=new_qc) + return DatetimeIndex(query_compiler=new_qc) diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index f258f261b51..1012a0d3959 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -146,7 +146,7 @@ def test_df_loc_get_tuple_key( snow_row = row query_count = 1 - if is_scalar(row) or isinstance(row, tuple) or isinstance(row, native_pd.Index): + if is_scalar(row) or isinstance(row, tuple): query_count = 2 with SqlCounter( @@ -945,11 +945,7 @@ def loc_set_helper(df): _row_key = key_converter(row_key, df) df.loc[_row_key] = pd.DataFrame(item) - with SqlCounter( - # one extra query to convert to series to set item - query_count=2 if key_type == "index" else 1, - join_count=expected_join_count, - ): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( pd.DataFrame(native_df), native_df, loc_set_helper, inplace=True ) @@ -971,11 +967,7 @@ def loc_set_helper(df): _row_key = key_converter(row_key, df) df.loc[_row_key, :] = pd.DataFrame(item) - with SqlCounter( - # one extra query to convert to series to set item - query_count=2 if key_type == "index" else 1, - join_count=expected_join_count, - ): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( pd.DataFrame(native_df), native_df, loc_set_helper, inplace=True ) @@ -1153,9 +1145,6 @@ def loc_set_helper(df): query_count, join_count = 1, 2 if not all(isinstance(rk_val, bool) for rk_val in row_key): join_count += 2 - # one extra query to convert to native pandas to initialize series and set item - if key_type == "index": - query_count = 2 if isinstance(col_key, 
native_pd.Series): query_count += 1 with SqlCounter(query_count=query_count, join_count=join_count): @@ -1235,10 +1224,6 @@ def loc_set_helper(df): if isinstance(col_key, native_pd.Series): query_count += 1 - # one extra query to convert to native pandas to initialize series and set item - if key_type == "index": - query_count += 1 - with SqlCounter( query_count=query_count, join_count=join_count, @@ -1316,8 +1301,7 @@ def loc_set_helper(df): else: df.loc[row_key, :] = pd.DataFrame(item) - # one extra query to convert index to native pandas to initialize series and set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=4): + with SqlCounter(query_count=1, join_count=4): if item.index.has_duplicates: # pandas fails to update duplicated rows with duplicated item with pytest.raises( @@ -1641,8 +1625,7 @@ def loc_helper(df): return _df.loc[_key] - # one extra query to convert index to native pandas to initialize series and set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -1985,8 +1968,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): ) # default index - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -2000,8 +1982,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): "index" ) non_default_index_snowpark_pandas_df = pd.DataFrame(non_default_index_native_df) - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( non_default_index_snowpark_pandas_df, non_default_index_native_df, @@ -2021,8 +2002,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): ] ) dup_snowpandas_df = pd.DataFrame(dup_native_df) - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_df, dup_native_df, @@ -2047,8 +2027,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): ] ) dup_snowpandas_df = pd.DataFrame(dup_native_df) - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_df, dup_native_df, diff --git a/tests/integ/modin/frame/test_set_index.py b/tests/integ/modin/frame/test_set_index.py index 15566d630f1..e0088673282 100644 --- a/tests/integ/modin/frame/test_set_index.py +++ b/tests/integ/modin/frame/test_set_index.py @@ -320,11 +320,7 @@ def test_set_index_pass_arrays_duplicate(obj_type1, obj_type2, drop, append, nat obj_type2 = native_pd.Index native_keys = [obj_type1(array), obj_type2(array)] - query_count = 4 - # one extra query per modin index to create the series and set index - query_count += 1 if obj_type1 == native_pd.Index else 0 - query_count += 1 if obj_type2 == native_pd.Index else 0 - with SqlCounter(query_count=query_count, join_count=2): + with SqlCounter(query_count=4, join_count=2): eval_snowpark_pandas_result( snow_df, native_df, diff --git 
a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 32c1bf64c4a..21fbf6aeafa 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -319,7 +319,7 @@ def loc_helper(ser): return _ser.loc[_key] default_index_series = pd.Series(default_index_native_series) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_series, default_index_native_series, @@ -480,7 +480,7 @@ def type_convert(key, is_snow_type): # Note: here number of queries are 2 due to the data type of the series is variant and to_pandas needs to call # typeof to get the value types # TODO: SNOW-933782 optimize to_pandas for variant columns to only fire one query - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_series, default_index_native_series, @@ -497,7 +497,7 @@ def type_convert(key, is_snow_type): non_default_index_snowpark_pandas_series = pd.Series( non_default_index_native_series ) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( non_default_index_snowpark_pandas_series, non_default_index_native_series, @@ -514,7 +514,7 @@ def type_convert(key, is_snow_type): ] ) dup_snowpandas_series = pd.Series(dup_native_series) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_series, dup_native_series, @@ -539,7 +539,7 @@ def type_convert(key, is_snow_type): ] ) dup_snowpandas_series = pd.Series(dup_native_series) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_series, dup_native_series, @@ -776,15 +776,15 @@ def loc_set_helper(s): s.loc[_row_key] = _item query_count = 1 - # 6 extra queries: sum of two cases below + # 5 extra queries: sum of two cases below if item_type.startswith("index") and key_type.startswith("index"): - query_count = 7 + query_count = 6 # 4 extra queries: 1 query to convert item index to pandas in loc_set_helper, 2 for iter, and 1 for to_list elif item_type.startswith("index"): query_count = 5 - # 2 extra queries: 1 query to convert key index to pandas in loc_set_helper and 1 to convert to series to setitem + # 1 extra query to convert to series to setitem elif key_type.startswith("index"): - query_count = 3 + query_count = 2 with SqlCounter(query_count=query_count, join_count=expected_join_count): eval_snowpark_pandas_result( pd.Series(series), series, loc_set_helper, inplace=True @@ -834,11 +834,7 @@ def loc_set_helper(s): else: s.loc[pd.Series(row_key)] = pd.DataFrame(item) - qc = 0 - if key_type == "index": - qc = 1 - - with SqlCounter(query_count=qc): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( pd.Series(series), series, diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 9437bb6a36c..628af787ac4 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -657,10 +657,10 @@ def test_concat_keys_with_none(df1, df2, axis): ) def test_concat_with_keys_and_names(df1, df2, names, name1, name2, axis): # One extra query to convert index to native pandas when creating df - with 
SqlCounter(query_count=0 if name1 is None or axis == 1 else 4, join_count=0): + with SqlCounter(query_count=0 if name1 is None or axis == 1 else 3, join_count=0): df1 = df1.rename_axis(name1, axis=axis) # One extra query to convert index to native pandas when creating df - with SqlCounter(query_count=0 if name2 is None or axis == 1 else 4, join_count=0): + with SqlCounter(query_count=0 if name2 is None or axis == 1 else 3, join_count=0): df2 = df2.rename_axis(name2, axis=axis) expected_join_count = ( diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index 07fb4aefebf..a0ac55958a9 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -570,7 +570,7 @@ def test_to_datetime_mixed_datetime_and_string(self): pytest.param("US/Central"), ], ) - @sql_count_checker(query_count=3) + @sql_count_checker(query_count=2) def test_to_datetime_dtarr(self, tz): # DatetimeArray dti = native_pd.date_range("1965-04-03", periods=19, freq="2W", tz=tz) From d50dba638149d82ddda48b57e7d7331721d36422 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Sat, 10 Aug 2024 15:14:52 -0700 Subject: [PATCH 4/4] Fix more query counts --- .../snowpark/modin/plugin/extensions/index.py | 1 + tests/integ/modin/frame/test_axis.py | 21 ++++++++----------- .../integ/modin/frame/test_drop_duplicates.py | 3 +-- tests/integ/modin/frame/test_getitem.py | 4 ++-- .../modin/frame/test_nlargest_nsmallest.py | 2 +- tests/integ/modin/frame/test_set_index.py | 21 +++++-------------- tests/integ/modin/series/test_axis.py | 20 +++++++++--------- tests/integ/modin/series/test_getitem.py | 2 +- tests/integ/modin/series/test_loc.py | 3 --- tests/integ/modin/series/test_setitem.py | 8 +++---- tests/integ/modin/test_concat.py | 2 -- tests/integ/modin/test_telemetry.py | 2 +- 12 files changed, 34 insertions(+), 55 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index a3b4265708a..82acc627601 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2383,6 +2383,7 @@ def _to_datetime( origin: DateTimeOrigin = "unix", ) -> Index: """ + Convert index to DatetimeIndex. Args: errors: {'ignore', 'raise', 'coerce'}, default 'raise' If 'raise', then invalid parsing will raise an exception. diff --git a/tests/integ/modin/frame/test_axis.py b/tests/integ/modin/frame/test_axis.py index b253906ba53..28cf55dee40 100644 --- a/tests/integ/modin/frame/test_axis.py +++ b/tests/integ/modin/frame/test_axis.py @@ -81,8 +81,7 @@ def test_index(test_df): @pytest.mark.parametrize("test_df", test_dfs) -# One extra query to convert lazy index to series to set index -@sql_count_checker(query_count=9, join_count=3) +@sql_count_checker(query_count=8, join_count=3) def test_set_and_assign_index(test_df): def assign_index(df, keys): df.index = keys @@ -290,7 +289,7 @@ def test_duplicate_labels_assignment(): native_pd.DataFrame({"A": [3.14, 1.414, 1.732], "B": [9.8, 1.0, 0]}), "rows", [None] * 3, - 6, + 5, 2, ], [ # Labels is a MultiIndex from tuples. 
@@ -307,7 +306,7 @@ def test_duplicate_labels_assignment(): native_pd.DataFrame({"A": ["foo", "bar", 3], "B": [4, "baz", 6]}), 0, {1: "c", 2: "b", 3: "a"}, - 6, + 5, 2, ], [ @@ -327,7 +326,7 @@ def test_duplicate_labels_assignment(): ), 0, ['"row 1"', "row 2"], - 6, + 5, 2, ], [ @@ -340,7 +339,7 @@ def test_duplicate_labels_assignment(): ), "rows", list(range(10)), - 6, + 5, 2, ], [ @@ -875,8 +874,7 @@ def test_set_axis_df_raises_value_error_diff_error_msg( ): # Should raise a ValueError if the labels for row-like axis are invalid. # The error messages do not match native pandas. - # one extra query to convert to native pandas in series constructor - with SqlCounter(query_count=2 if isinstance(labels, native_pd.MultiIndex) else 3): + with SqlCounter(query_count=2): with pytest.raises(ValueError, match=error_msg): pd.DataFrame(native_df).set_axis(labels, axis=axis) @@ -894,7 +892,7 @@ def test_set_axis_df_raises_type_error_diff_error_msg( pd.DataFrame(native_df).set_axis(labels, axis=axis) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_df_set_axis_copy_true(caplog): # Test that warning is raised when copy argument is used. native_df = native_pd.DataFrame({"A": [1.25], "B": [3]}) @@ -935,12 +933,11 @@ def test_df_set_axis_with_quoted_index(): # check first that operation result is the same snow_df = pd.DataFrame(data) native_df = native_pd.DataFrame(data) - # One extra query to convert to native pandas in series constructor - with SqlCounter(query_count=4): + with SqlCounter(query_count=3): eval_snowpark_pandas_result(snow_df, native_df, helper) # then, explicitly compare axes - with SqlCounter(query_count=2): + with SqlCounter(query_count=1): ans = helper(snow_df) native_ans = helper(native_df) diff --git a/tests/integ/modin/frame/test_drop_duplicates.py b/tests/integ/modin/frame/test_drop_duplicates.py index 3cf38708038..35c4a8edb05 100644 --- a/tests/integ/modin/frame/test_drop_duplicates.py +++ b/tests/integ/modin/frame/test_drop_duplicates.py @@ -64,8 +64,7 @@ def test_drop_duplicates(subset, keep, ignore_index): query_count = 1 join_count = 2 if ignore_index is True: - # One extra query to convert index to native pandas in series constructor - query_count += 3 + query_count += 2 join_count += 3 with SqlCounter(query_count=query_count, join_count=join_count): assert_frame_equal( diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index 746a8aa6550..fd4ede77d77 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -39,9 +39,9 @@ def test_df_getitem_with_boolean_list_like( key, default_index_snowpark_pandas_df, default_index_native_df ): - # one added query to convert to native pandas and 2 added queries for series initialization + # one added query to convert to native pandas and 1 added query for series initialization with SqlCounter( - query_count=4 if isinstance(key, native_pd.Index) else 1, join_count=1 + query_count=3 if isinstance(key, native_pd.Index) else 1, join_count=1 ): # df[boolean list-like key] is the same as df.loc[:, boolean list-like key] if isinstance(key, native_pd.Index): diff --git a/tests/integ/modin/frame/test_nlargest_nsmallest.py b/tests/integ/modin/frame/test_nlargest_nsmallest.py index fa57ddeadd2..3b6318179f2 100644 --- a/tests/integ/modin/frame/test_nlargest_nsmallest.py +++ b/tests/integ/modin/frame/test_nlargest_nsmallest.py @@ -54,7 +54,7 @@ def test_nlargest_nsmallest_large_n(snow_df, native_df, method): ) 
-@sql_count_checker(query_count=5, join_count=1) +@sql_count_checker(query_count=4, join_count=1) def test_nlargest_nsmallest_overlapping_index_name(snow_df, native_df, method): snow_df = snow_df.rename_axis("A") native_df = native_df.rename_axis("A") diff --git a/tests/integ/modin/frame/test_set_index.py b/tests/integ/modin/frame/test_set_index.py index e0088673282..ae035f0b3a4 100644 --- a/tests/integ/modin/frame/test_set_index.py +++ b/tests/integ/modin/frame/test_set_index.py @@ -80,8 +80,7 @@ def test_set_index_multiindex_columns(snow_df): ) -# One extra query to convert to native pandas to create series to set index -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_set_index_negative(snow_df, native_df): index = pd.Index([1, 2]) native_index = native_pd.Index([1, 2]) @@ -122,7 +121,7 @@ def test_set_index_names(snow_df): # Verify name from input index is set. index = pd.Index([1, 2, 0]) index.names = ["iname"] - with SqlCounter(query_count=3): + with SqlCounter(query_count=2): assert snow_df.set_index(index).index.names == ["iname"] # Verify names from input multiindex are set. @@ -229,11 +228,8 @@ def test_set_index_pass_single_array(obj_type, drop, append, native_df): ) else: expected_query_count = 3 - if obj_type == pd.Series: + if obj_type == pd.Series or obj_type == pd.Index: expected_query_count = 4 - # two extra queries, one to convert to native pandas (like series case) and one to create the series to set index - if obj_type == pd.Index: - expected_query_count = 5 with SqlCounter(query_count=expected_query_count, join_count=1): eval_snowpark_pandas_result( snow_df, @@ -268,11 +264,7 @@ def test_set_index_pass_arrays(obj_type, drop, append, native_df): "a", key.to_pandas() if isinstance(key, (pd.Series, pd.Index)) else key, ] - query_count = 3 - # one extra query to convert to series to set index - if obj_type == pd.Index: - query_count = 4 - with SqlCounter(query_count=query_count, join_count=1): + with SqlCounter(query_count=3, join_count=1): eval_snowpark_pandas_result( snow_df, native_df, @@ -433,7 +425,7 @@ def test_set_index_raise_on_len(length, obj_type, drop, append, native_df): msg = "Length mismatch: Expected 3 rows, received array of length.*" # wrong length directly # one extra query to create the series to set index - with SqlCounter(query_count=3 if obj_type == native_pd.Index else 2): + with SqlCounter(query_count=2): eval_snowpark_pandas_result( snow_df, native_df, @@ -451,9 +443,6 @@ def test_set_index_raise_on_len(length, obj_type, drop, append, native_df): expected_query_count = 1 if obj_type == native_pd.Series: expected_query_count = 0 - # one extra query to convert to native pandas to create the series to set index - if obj_type == native_pd.Index: - expected_query_count = 2 keys = ["a", key] native_keys = ["a", native_key] with SqlCounter(query_count=expected_query_count): diff --git a/tests/integ/modin/series/test_axis.py b/tests/integ/modin/series/test_axis.py index af00662f8db..d099272d6e9 100644 --- a/tests/integ/modin/series/test_axis.py +++ b/tests/integ/modin/series/test_axis.py @@ -30,7 +30,7 @@ native_pd.Series({"A": [1, 2, 3], 5 / 6: [4, 5, 6]}), "index", [None] * 2, - 4, + 3, 1, ], [ @@ -44,7 +44,7 @@ ), "index", ["iccanobif", "serauqs", "semirp"], - 4, + 3, 1, ], [ @@ -58,7 +58,7 @@ ), "index", native_pd.Series(["iccanobif", "serauqs", "semirp"], name="reverse names"), - 4, + 3, 1, ], [ @@ -73,7 +73,7 @@ ), 0, native_pd.Index([99, 999, 9999, 99999, 999999]), - 4, + 3, 1, ], [ @@ -88,7 +88,7 @@ ), 0, 
native_pd.Index([99, 999, 9999, 99999, 999999], name="index with name"), - 4, + 3, 1, ], [ @@ -104,7 +104,7 @@ ), 0, native_pd.Index([99, 999, 9999, 99999, 999999], name="index with name"), - 4, + 3, 1, ], [ # Index is a MultiIndex from tuples. @@ -165,14 +165,14 @@ native_pd.Series({"A": ["foo", "bar", 3], "B": [4, "baz", 6]}), "index", {1: 1, 2: 2}, - 4, + 3, 1, ], [ native_pd.Series({"A": ["foo", "bar", 3], "B": [4, "baz", 6]}), "rows", {1, 2}, - 4, + 3, 1, ], ] @@ -440,7 +440,7 @@ def test_set_axis_series_raises_value_error_diff_error_msg( ): # Should raise a ValueError if length of labels passed in # don't match the number of rows. - with SqlCounter(query_count=2 if isinstance(labels, native_pd.MultiIndex) else 3): + with SqlCounter(query_count=2): with pytest.raises(ValueError, match=error_msg): pd.Series(ser).set_axis(labels, axis=axis) @@ -474,7 +474,7 @@ def test_set_axis_series_raises_type_error(ser, axis, labels, error_msg): pd.Series(ser).set_axis(labels, axis=axis) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_series_set_axis_copy_true(caplog): # Test that warning is raised when copy argument is used. series = native_pd.Series([1.25]) diff --git a/tests/integ/modin/series/test_getitem.py b/tests/integ/modin/series/test_getitem.py index 3c297f32d0b..0ea84425d18 100644 --- a/tests/integ/modin/series/test_getitem.py +++ b/tests/integ/modin/series/test_getitem.py @@ -46,7 +46,7 @@ def getitem_helper(ser): _key, _ser = snow_key, ser return _ser[_key] - with SqlCounter(query_count=2 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_series, default_index_native_series, diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 21fbf6aeafa..aa16a841f27 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -477,9 +477,6 @@ def type_convert(key, is_snow_type): return s.loc[type_convert(native_series_key, isinstance(s, pd.Series))] # default index - # Note: here number of queries are 2 due to the data type of the series is variant and to_pandas needs to call - # typeof to get the value types - # TODO: SNOW-933782 optimize to_pandas for variant columns to only fire one query with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_series, diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py index 407e93c6a12..50405643bc3 100644 --- a/tests/integ/modin/series/test_setitem.py +++ b/tests/integ/modin/series/test_setitem.py @@ -1560,7 +1560,7 @@ def test_series_setitem_with_empty_key_and_empty_item_negative( else: snowpark_key = key - with SqlCounter(query_count=1 if isinstance(key, native_pd.Index) else 0): + with SqlCounter(query_count=0): err_msg = "The length of the value/item to set is empty" with pytest.raises(ValueError, match=err_msg): @@ -1601,7 +1601,7 @@ def test_series_setitem_with_empty_key_and_empty_series_item( else: snowpark_key = key - with SqlCounter(query_count=2 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=1): native_ser[key] = item snowpark_ser[ pd.Series(snowpark_key) @@ -1649,9 +1649,7 @@ def test_series_setitem_with_empty_key_and_scalar_item( else: snowpark_key = key - with SqlCounter( - query_count=2 if isinstance(key, native_pd.Index) else 1, join_count=2 - ): + with SqlCounter(query_count=1, join_count=2): native_ser[key] = 
item snowpark_ser[ pd.Series(snowpark_key) diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 628af787ac4..1049d5ea21b 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -656,10 +656,8 @@ def test_concat_keys_with_none(df1, df2, axis): "name1, name2", [("one", "two"), ("one", None), (None, "two"), (None, None)] ) def test_concat_with_keys_and_names(df1, df2, names, name1, name2, axis): - # One extra query to convert index to native pandas when creating df with SqlCounter(query_count=0 if name1 is None or axis == 1 else 3, join_count=0): df1 = df1.rename_axis(name1, axis=axis) - # One extra query to convert index to native pandas when creating df with SqlCounter(query_count=0 if name2 is None or axis == 1 else 3, join_count=0): df2 = df2.rename_axis(name2, axis=axis) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index c908b56c56a..ba20286579a 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -474,7 +474,7 @@ def test_telemetry_private_method(name, method, expected_query_count): assert data["api_calls"] == [{"name": f"DataFrame.DataFrame.{name}"}] -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_telemetry_property_index(): df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df._query_compiler.snowpark_pandas_api_calls.clear()
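
For reference, a minimal sketch of the user-facing effect of this patch series, assuming an active Snowpark session configured for Snowpark pandas. The import path follows the module layout visible in this diff (newer releases document a different entry point via modin.pandas plus the Snowpark plugin), and the printed reprs mirror the updated doctests above.

import snowflake.snowpark.modin.pandas as pd

# List-like input: previously returned a Snowpark pandas Series, now a lazy DatetimeIndex.
converted = pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00:15"])
print(converted)
# DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)

# pd.date_range changes the same way.
rng = pd.date_range(start="1/1/2018", end="1/08/2018")
print(type(rng).__name__)  # DatetimeIndex

# Scalar input still returns a scalar Timestamp rather than an Index.
ts = pd.to_datetime("2018-10-26 12:00")
print(repr(ts))  # Timestamp('2018-10-26 12:00:00')

# Callers that relied on the old Series return type can convert explicitly; the
# Series constructor updated in this patch accepts a lazy Index without pulling
# data to the client.
as_series = pd.Series(converted)
print(as_series.dtype)  # datetime64[ns]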