From 386e7d5bdfc01be389e4f7d1370b3d980e4a7454 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Wed, 7 Aug 2024 10:55:05 -0700 Subject: [PATCH 1/4] SNOW-1486910: Return DatetimeIndex from to_datetime and date_range --- CHANGELOG.md | 2 + .../snowpark/modin/pandas/general.py | 167 ++++++------------ .../compiler/snowflake_query_compiler.py | 19 +- .../snowpark/modin/plugin/extensions/index.py | 64 ++++++- tests/integ/modin/tools/test_date_range.py | 8 +- tests/integ/modin/tools/test_to_datetime.py | 50 +++--- tests/integ/modin/utils.py | 3 +- 7 files changed, 166 insertions(+), 147 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc28d76f3b8..4339973da47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -79,6 +79,8 @@ ### Behavior change - `Dataframe.columns` now returns native pandas Index object instead of Snowpark Index object. - Refactor and introduce `query_compiler` argument in `Index` constructor to create `Index` from query compiler. +- `pd.to_datetime` now returns a DatetimeIndex object instead of a Series object. +- `pd.date_range` now returns a DatetimeIndex object instead of a Series object. ## 1.20.0 (2024-07-17) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 468b31f5367..417d1edca5f 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1572,10 +1572,7 @@ def to_datetime( >>> pd.to_datetime([1, 2, 3], unit='D', ... origin=pd.Timestamp('1960-01-01')) - 0 1960-01-02 - 1 1960-01-03 - 2 1960-01-04 - dtype: datetime64[ns] + DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None) **Non-convertible date/times** @@ -1589,9 +1586,7 @@ def to_datetime( in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. >>> pd.to_datetime(['13000101', 'abc'], format='%Y%m%d', errors='coerce') - 0 NaT - 1 NaT - dtype: datetime64[ns] + DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) .. _to_datetime_tz_examples: @@ -1603,55 +1598,41 @@ def to_datetime( - Timezone-naive inputs are converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series`: >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) - 0 2018-10-26 12:00:00 - 1 2018-10-26 13:00:15 - dtype: datetime64[ns] + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) - Timezone-aware inputs *with constant time offset* are still converted to timezone-naive :class:`~snowflake.snowpark.modin.pandas.Series` by default. 
>>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500']) - 0 2018-10-26 12:00:00 - 1 2018-10-26 13:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:00'], dtype='datetime64[ns]', freq=None) - Use right format to convert to timezone-aware type (Note that when call Snowpark pandas API to_pandas() the timezone-aware output will always be converted to session timezone): >>> pd.to_datetime(['2018-10-26 12:00:00 -0500', '2018-10-26 13:00:00 -0500'], format="%Y-%m-%d %H:%M:%S %z") - 0 2018-10-26 10:00:00-07:00 - 1 2018-10-26 11:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2018-10-26 10:00:00-07:00', '2018-10-26 11:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) - Timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris): >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100']) - 0 2020-10-25 02:00:00 - 1 2020-10-25 04:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2020-10-25 02:00:00', '2020-10-25 04:00:00'], dtype='datetime64[ns]', freq=None) >>> pd.to_datetime(['2020-10-25 02:00:00 +0200', '2020-10-25 04:00:00 +0100'], format="%Y-%m-%d %H:%M:%S %z") - 0 2020-10-24 17:00:00-07:00 - 1 2020-10-24 20:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2020-10-24 17:00:00-07:00', '2020-10-24 20:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) Setting ``utc=True`` makes sure always convert to timezone-aware outputs: - Timezone-naive inputs are *localized* based on the session timezone >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True) - 0 2018-10-26 12:00:00-07:00 - 1 2018-10-26 13:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2018-10-26 12:00:00-07:00', '2018-10-26 13:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) - Timezone-aware inputs are *converted* to session timezone >>> pd.to_datetime(['2018-10-26 12:00:00 -0530', '2018-10-26 12:00:00 -0500'], ... utc=True) - 0 2018-10-26 10:30:00-07:00 - 1 2018-10-26 10:00:00-07:00 - dtype: datetime64[ns, America/Los_Angeles] + DatetimeIndex(['2018-10-26 10:30:00-07:00', '2018-10-26 10:00:00-07:00'], dtype='datetime64[ns, America/Los_Angeles]', freq=None) """ # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py raise_if_native_pandas_objects(arg) @@ -1668,22 +1649,14 @@ def to_datetime( message="cache parameter is ignored with Snowflake backend, i.e., no caching will be applied", ) arg_is_scalar = is_scalar(arg) - # handle empty array, list, dict - if not arg_is_scalar and not isinstance(arg, (DataFrame, Series)) and len(arg) == 0: - return arg if isinstance(arg, Series) else Series(arg) # always return a Series - if not isinstance(arg, (DataFrame, Series)): - # turn dictionary like arg into DataFrame and list like or scalar to Series - if isinstance(arg, dict): - arg = DataFrame(arg) # pragma: no cover - else: - name = None - # keep index name - if isinstance(arg, pd.Index): - name = arg.name - arg = Series(arg) - arg.name = name - - series = arg._to_datetime( + + if not isinstance(arg, (DataFrame, Series, pd.Index)): + # Turn dictionary like arg into pd.DataFrame and list-like or scalar to + # pd.Index. 
+ arg = [arg] if arg_is_scalar else arg + arg = DataFrame(arg) if isinstance(arg, dict) else pd.Index(arg) + + series_or_index = arg._to_datetime( errors=errors, dayfirst=dayfirst, yearfirst=yearfirst, @@ -1697,9 +1670,10 @@ def to_datetime( if arg_is_scalar: # Calling squeeze directly on Snowpark pandas Series makes an unnecessary # count sql call. To avoid that we convert Snowpark pandas Series to Native - # pandas seris first. - return series.to_pandas().squeeze() - return series + # pandas series first. + # Note: When arg_is_scalar is True 'series_or_index' is always an Index. + return series_or_index.to_series().to_pandas().squeeze() + return series_or_index @snowpark_pandas_telemetry_standalone_function_decorator @@ -2004,9 +1978,9 @@ def date_range( name: Hashable | None = None, inclusive: IntervalClosedType = "both", **kwargs, -) -> Series: +) -> pd.DatetimeIndex: """ - Return a fixed frequency series. + Return a fixed frequency DatetimeIndex. Returns the range of equally spaced time points (where the difference between any two adjacent points is specified by the given frequency) such that they all @@ -2078,109 +2052,72 @@ def date_range( Specify `start` and `end`, with the default daily frequency. >>> pd.date_range(start='1/1/2018', end='1/08/2018') - 0 2018-01-01 - 1 2018-01-02 - 2 2018-01-03 - 3 2018-01-04 - 4 2018-01-05 - 5 2018-01-06 - 6 2018-01-07 - 7 2018-01-08 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq=None) Specify `start` and `periods`, the number of periods (days). >>> pd.date_range(start='1/1/2018', periods=8) - 0 2018-01-01 - 1 2018-01-02 - 2 2018-01-03 - 3 2018-01-04 - 4 2018-01-05 - 5 2018-01-06 - 6 2018-01-07 - 7 2018-01-08 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq=None) Specify `end` and `periods`, the number of periods (days). >>> pd.date_range(end='1/1/2018', periods=8) - 0 2017-12-25 - 1 2017-12-26 - 2 2017-12-27 - 3 2017-12-28 - 4 2017-12-29 - 5 2017-12-30 - 6 2017-12-31 - 7 2018-01-01 - dtype: datetime64[ns] + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq=None) Specify `start`, `end`, and `periods`; the frequency is generated automatically (linearly spaced). >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3) - 0 2018-04-24 00:00:00 - 1 2018-04-25 12:00:00 - 2 2018-04-27 00:00:00 - dtype: datetime64[ns] + DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', + '2018-04-27 00:00:00'], + dtype='datetime64[ns]', freq=None) + **Other Parameters** Changed the `freq` (frequency) to ``'ME'`` (month end frequency). >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') - 0 2018-01-31 - 1 2018-02-28 - 2 2018-03-31 - 3 2018-04-30 - 4 2018-05-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq=None) Multiples are allowed >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') - 0 2018-01-31 - 1 2018-04-30 - 2 2018-07-31 - 3 2018-10-31 - 4 2019-01-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq=None) `freq` can also be specified as an Offset object. 
>>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) - 0 2018-01-31 - 1 2018-04-30 - 2 2018-07-31 - 3 2018-10-31 - 4 2019-01-31 - dtype: datetime64[ns] + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq=None) `inclusive` controls whether to include `start` and `end` that are on the boundary. The default, "both", includes boundary points on either end. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both") - 0 2017-01-01 - 1 2017-01-02 - 2 2017-01-03 - 3 2017-01-04 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None) Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left') - 0 2017-01-01 - 1 2017-01-02 - 2 2017-01-03 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq=None) Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and similarly ``inclusive='neither'`` will exclude both `start` and `end`. >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right') - 0 2017-01-02 - 1 2017-01-03 - 2 2017-01-04 - dtype: datetime64[ns] + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq=None) """ # TODO: SNOW-1063345: Modin upgrade - modin.pandas functions in general.py @@ -2229,9 +2166,11 @@ def date_range( left_inclusive=left_inclusive, right_inclusive=right_inclusive, ) - s = Series(query_compiler=qc) - s.name = name - return s + # Set date range as index column. + qc = qc.set_index_from_columns(qc.columns.tolist()) + # Set index column name. + qc = qc.set_index_names([name]) + return pd.DatetimeIndex(data=qc) @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 22ac143dfe2..17fcf35f6d7 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5551,7 +5551,7 @@ def set_index_from_columns( for ( ids ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - keys + keys, include_index=False ): # Error checking for missing labels is already done in frontend layer. index_column_snowflake_quoted_identifiers.append(ids[0]) @@ -5870,6 +5870,7 @@ def series_to_datetime( unit: Optional[str] = None, infer_datetime_format: Union[lib.NoDefault, bool] = lib.no_default, origin: DateTimeOrigin = "unix", + include_index: bool = False, ) -> "SnowflakeQueryCompiler": """ Convert series to the datetime dtype. @@ -5884,6 +5885,7 @@ def series_to_datetime( unit: to_datetime unit infer_datetime_format: to_datetime infer_datetime_format origin: to_datetime origin + include_index: If True, also convert index columns to datetime. Returns: SnowflakeQueryCompiler: QueryCompiler with a single data column converted to datetime dtype. 
@@ -5896,12 +5898,16 @@ def series_to_datetime( to_snowflake_timestamp_format(format) if format is not None else None ) id_to_sf_type_map = self._modin_frame.quoted_identifier_to_snowflake_type() - col_id = self._modin_frame.data_column_snowflake_quoted_identifiers[0] - sf_type = id_to_sf_type_map[col_id] + col_ids = [] + if include_index: + col_ids = self._modin_frame.index_column_snowflake_quoted_identifiers + col_ids.extend(self._modin_frame.data_column_snowflake_quoted_identifiers) - if isinstance(sf_type, BooleanType): - # bool is not allowed in to_datetime (but note that bool is allowed by astype) - raise TypeError("dtype bool cannot be converted to datetime64[ns]") + for col_id in col_ids: + sf_type = id_to_sf_type_map[col_id] + if isinstance(sf_type, BooleanType): + # bool is not allowed in to_datetime (but note that bool is allowed by astype) + raise TypeError("dtype bool cannot be converted to datetime64[ns]") to_datetime_cols = { col_id: generate_timestamp_col( @@ -5913,6 +5919,7 @@ def series_to_datetime( unit="ns" if unit is None else unit, origin=origin, ) + for col_id in col_ids } return SnowflakeQueryCompiler( self._modin_frame.update_snowflake_quoted_identifiers_with_expressions( diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index ef94d7bb7ea..e11ac325f0d 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -30,7 +30,7 @@ import numpy as np import pandas as native_pd from pandas._libs import lib -from pandas._typing import ArrayLike, DtypeObj, NaPosition +from pandas._typing import ArrayLike, DateTimeErrorChoices, DtypeObj, NaPosition from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import is_datetime64_any_dtype, pandas_dtype @@ -39,6 +39,7 @@ from snowflake.snowpark.modin.pandas.base import BasePandasDataset from snowflake.snowpark.modin.pandas.utils import try_convert_index_to_native from snowflake.snowpark.modin.plugin._internal.telemetry import TelemetryMeta +from snowflake.snowpark.modin.plugin._internal.timestamp_utils import DateTimeOrigin from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import ( SnowflakeQueryCompiler, ) @@ -2368,3 +2369,64 @@ def str(self) -> native_pd.core.strings.accessor.StringMethods: """ WarningMessage.index_to_pandas_warning("str") return self.to_pandas().str + + def _to_datetime( + self, + errors: DateTimeErrorChoices = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: bool = False, + format: str = None, + exact: bool | lib.NoDefault = lib.no_default, + unit: str = None, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, + origin: DateTimeOrigin = "unix", + ) -> Index: + """ + Args: + errors: {'ignore', 'raise', 'coerce'}, default 'raise' + If 'raise', then invalid parsing will raise an exception. + If 'coerce', then invalid parsing will be set as NaT. + If 'ignore', then invalid parsing will return the input. + dayfirst: bool, default False + Specify a date parse order if arg is str or is list-like. + yearfirst: bool, default False + Specify a date parse order if arg is str or is list-like. + utc: bool, default False + Control timezone-related parsing, localization and conversion. + format: str, default None + The strftime to parse time + exact: bool, default True + Control how format is used: + True: require an exact format match. 
+ False: allow the format to match anywhere in the target string. + unit: str, default 'ns' + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer + or float number. + infer_datetime_format: bool, default False + If True and no format is given, attempt to infer the format of the \ + datetime strings based on the first non-NaN element. + origin: scalar, default 'unix' + Define the reference date. The numeric values would be parsed as number + of units (defined by unit) since this reference date. + + Returns: + DatetimeIndex + """ + from snowflake.snowpark.modin.plugin.extensions.datetime_index import ( + DatetimeIndex, + ) + + new_qc = self._query_compiler.series_to_datetime( + errors, + dayfirst, + yearfirst, + utc, + format, + exact, + unit, + infer_datetime_format, + origin, + include_index=True, + ) + return DatetimeIndex(data=new_qc) diff --git a/tests/integ/modin/tools/test_date_range.py b/tests/integ/modin/tools/test_date_range.py index a1f3c4d0ba2..d26861442f4 100644 --- a/tests/integ/modin/tools/test_date_range.py +++ b/tests/integ/modin/tools/test_date_range.py @@ -63,7 +63,7 @@ @sql_count_checker(query_count=1) def test_regular_range(kwargs): assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -125,7 +125,7 @@ def test_regular_range(kwargs): @sql_count_checker(query_count=1) def test_irregular_range(kwargs): assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -168,7 +168,7 @@ def test_without_freq(periods, inclusive): "inclusive": inclusive, } assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) @@ -193,7 +193,7 @@ def test_without_freq(periods, inclusive): def test_inclusive(kwargs, inclusive): kwargs.update({"inclusive": inclusive}) assert_snowpark_pandas_equal_to_pandas( - pd.date_range(**kwargs), native_pd.Series(native_pd.date_range(**kwargs)) + pd.date_range(**kwargs), native_pd.date_range(**kwargs) ) diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index d08495b31e9..07fb4aefebf 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -26,6 +26,7 @@ ) from tests.integ.modin.sql_counter import sql_count_checker from tests.integ.modin.utils import ( + assert_index_equal, assert_series_equal, assert_snowpark_pandas_equal_to_pandas, eval_snowpark_pandas_result, @@ -58,10 +59,10 @@ def test_to_datetime_readonly(self, readonly): if readonly: arr.setflags(write=False) result = to_datetime(arr) - expected = Series([], dtype=object) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex([]) + assert_index_equal(result, expected) - @pytest.mark.parametrize("box", [Series, native_pd.Index]) + @pytest.mark.parametrize("box", [Series, pd.Index]) @pytest.mark.parametrize( "format, expected", [ @@ -79,11 +80,18 @@ def test_to_datetime_readonly(self, readonly): def test_to_datetime_format(self, cache, box, format, expected): values = box(["1/1/2000", "1/2/2000", "1/3/2000"]) result = to_datetime(values, format=format, cache=cache) - expected = Series(expected) - assert_series_equal(result, expected) + expected = box(expected) + if box is Series: + assert_series_equal(result, 
expected) + else: + assert_index_equal(result, expected) + # cache values is ignored at Snowpark pandas so only test here to make sure it works as well result = to_datetime(values, format=format, cache=False) - assert_series_equal(result, expected) + if box is Series: + assert_series_equal(result, expected) + else: + assert_index_equal(result, expected) @pytest.mark.parametrize( "arg, expected, format", @@ -235,9 +243,9 @@ def test_to_datetime_format_YYYYMMDD_with_none(self, input_s): # GH 30011 # format='yyyymmdd' # with None - expected = Series([Timestamp("19801222"), Timestamp("20010112"), NaT]) - result = Series(to_datetime(input_s, format="%Y%m%d")) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex([Timestamp("19801222"), Timestamp("20010112"), NaT]) + result = to_datetime(input_s, format="%Y%m%d") + assert_index_equal(result, expected) @pytest.mark.parametrize( "input, expected", @@ -295,7 +303,7 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input, expected): def test_to_datetime_with_NA(self, data, format, expected): # GH#42957 result = to_datetime(data, format=format) - assert_series_equal(result, Series(expected)) + assert_index_equal(result, pd.DatetimeIndex(expected)) @sql_count_checker(query_count=1, udf_count=0) def test_to_datetime_format_integer_year_only(self, cache): @@ -509,10 +517,10 @@ def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self): fmt = "%Y-%m-%d %H:%M:%S %z" result = to_datetime(dates, format=fmt, utc=True) - expected = Series(expected_dates) - assert_series_equal(result, expected) + expected = pd.DatetimeIndex(expected_dates) + assert_index_equal(result, expected) result2 = to_datetime(dates, utc=True) - assert_series_equal(result2, expected) + assert_index_equal(result2, expected) @pytest.mark.parametrize( "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] @@ -529,8 +537,7 @@ def test_to_datetime_parse_timezone_malformed(self, offset): ): to_datetime([date], format=fmt).to_pandas() - # 2 extra queries to convert index to series - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=0) def test_to_datetime_parse_timezone_keeps_name(self): # GH 21697 fmt = "%Y-%m-%d %H:%M:%S %z" @@ -547,14 +554,14 @@ def test_to_datetime_mixed_datetime_and_string(self): res = to_datetime(["2020-01-01 17:00:00 -0100", d2]) # The input will become a series with variant type and the timezone is unaware by the Snowflake engine, so the # result ignores the timezone by default - expected = native_pd.Series( + expected = native_pd.DatetimeIndex( [datetime(2020, 1, 1, 17), datetime(2020, 1, 1, 18)] ) - assert_series_equal(res, expected, check_dtype=False, check_index_type=False) + assert_index_equal(res, expected) # Set utc=True to make sure timezone aware in to_datetime res = to_datetime(["2020-01-01 17:00:00 -0100", d2], utc=True) - expected = pd.Series([d1, d2]) - assert_series_equal(res, expected, check_dtype=False, check_index_type=False) + expected = pd.DatetimeIndex([d1, d2]) + assert_index_equal(res, expected) @pytest.mark.parametrize( "tz", @@ -563,13 +570,14 @@ def test_to_datetime_mixed_datetime_and_string(self): pytest.param("US/Central"), ], ) - @sql_count_checker(query_count=2) + @sql_count_checker(query_count=3) def test_to_datetime_dtarr(self, tz): # DatetimeArray dti = native_pd.date_range("1965-04-03", periods=19, freq="2W", tz=tz) arr = DatetimeArray(dti) + # Use assert_series_equal to ignore timezone difference in dtype. 
assert_series_equal( - to_datetime(arr), + Series(to_datetime(arr)), Series(arr), check_dtype=False, ) diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py index 6ec630024a3..06728f9e985 100644 --- a/tests/integ/modin/utils.py +++ b/tests/integ/modin/utils.py @@ -271,7 +271,8 @@ def assert_snowpark_pandas_equal_to_pandas( tm.assert_series_equal(snow_to_native, expected_pandas, **kwargs) else: assert isinstance(snow, Index) - kwargs.pop("check_dtype") + if "check_dtype" in kwargs: + kwargs.pop("check_dtype") if kwargs.pop("check_index_type"): kwargs.update(exact=False) tm.assert_index_equal(snow_to_native, expected_pandas, **kwargs) From 2b5a7725b3b1ad63a62222ea122aab74eaa32a3b Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Thu, 8 Aug 2024 21:45:42 -0700 Subject: [PATCH 2/4] Fix tests and update doctests --- .../snowpark/modin/pandas/general.py | 2 +- src/snowflake/snowpark/modin/pandas/series.py | 5 ++- .../compiler/snowflake_query_compiler.py | 5 ++- .../modin/plugin/docstrings/series_utils.py | 32 +++++++------------ tests/integ/modin/frame/test_duplicated.py | 2 +- tests/integ/modin/test_telemetry.py | 2 +- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 417d1edca5f..4161d316b0e 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -2167,7 +2167,7 @@ def date_range( right_inclusive=right_inclusive, ) # Set date range as index column. - qc = qc.set_index_from_columns(qc.columns.tolist()) + qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False) # Set index column name. qc = qc.set_index_names([name]) return pd.DatetimeIndex(data=qc) diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index 12dc9d10972..a494b513de5 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -130,7 +130,10 @@ def __init__( # modified: # Engine.subscribe(_update_engine) - if isinstance(data, type(self)): + # Convert lazy index to Series without pulling the data to client. + if isinstance(data, pd.Index): + query_compiler = data.to_series(index=index, name=name)._query_compiler + elif isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: if any(i not in data.index for i in index): diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 17fcf35f6d7..faa087464e5 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -5530,6 +5530,7 @@ def set_index_from_columns( keys: list[Hashable], drop: Optional[bool] = True, append: Optional[bool] = False, + include_index: Optional[bool] = True, ) -> "SnowflakeQueryCompiler": """ Create or update index (row labels) from a list of columns. @@ -5542,6 +5543,8 @@ def set_index_from_columns( append: bool, default False Whether to add the columns in `keys` as new levels appended to the existing index. + include_index: bool, default True + Whether the keys can also include index column lables as well. Returns: A new QueryCompiler instance with updated index. 
@@ -5551,7 +5554,7 @@ def set_index_from_columns( for ( ids ) in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels( - keys, include_index=False + keys, include_index=include_index ): # Error checking for missing labels is already done in frontend layer. index_column_snowflake_quoted_identifiers.append(ids[0]) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py index ba729cce6a9..c2519fc76d6 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py @@ -1353,7 +1353,7 @@ def dayofweek(): Examples -------- - >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D') + >>> s = pd.Series(pd.date_range('2016-12-31', '2017-01-08', freq='D')) >>> s 0 2016-12-31 1 2017-01-01 @@ -1390,7 +1390,7 @@ def dayofyear(): Examples -------- - >>> s = pd.to_datetime(["1/1/2020", "2/1/2020"]) + >>> s = pd.Series(pd.to_datetime(["1/1/2020", "2/1/2020"])) >>> s 0 2020-01-01 1 2020-02-01 @@ -1670,9 +1670,8 @@ def is_leap_year(): This method is available on Series with datetime values under the .dt accessor, and directly on DatetimeIndex. >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") - >>> idx # doctest: +SKIP - DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], - dtype='datetime64[ns]', freq='YE-DEC') + >>> idx + DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], dtype='datetime64[ns]', freq=None) >>> idx.is_leap_year # doctest: +SKIP array([ True, False, False]) @@ -1688,7 +1687,6 @@ def is_leap_year(): 2 False dtype: bool """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. @property def daysinmonth(): @@ -1762,22 +1760,19 @@ def month_name(): dtype: object >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='ME') + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) >>> idx.month_name() # doctest: +SKIP Index(['January', 'February', 'March'], dtype='object') Using the locale parameter you can set a different locale language, for example: idx.month_name(locale='pt_BR.utf8') will return month names in Brazilian Portuguese language. >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='ME') + >>> idx + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq=None) >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. def day_name(): """ @@ -1808,22 +1803,19 @@ def day_name(): dtype: object >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) >>> idx.day_name() # doctest: +SKIP Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') Using the locale parameter you can set a different locale language, for example: idx.day_name(locale='pt_BR.utf8') will return day names in Brazilian Portuguese language. 
>>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) - >>> idx # doctest: +SKIP - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + >>> idx + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None) >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Segunda', 'Terça', 'Quarta'], dtype='object') """ - # TODO(SNOW-1486910): Unskip when date_range returns DatetimeIndex. def total_seconds(): pass diff --git a/tests/integ/modin/frame/test_duplicated.py b/tests/integ/modin/frame/test_duplicated.py index d0031adf0c4..e4c5d594ecc 100644 --- a/tests/integ/modin/frame/test_duplicated.py +++ b/tests/integ/modin/frame/test_duplicated.py @@ -93,7 +93,7 @@ def test_duplicated_on_empty_frame(): @sql_count_checker(query_count=3, join_count=2) def test_frame_datetime64_duplicated(): - dates = pd.date_range("2010-07-01", end="2010-08-05") + dates = pd.date_range("2010-07-01", end="2010-08-05").to_series() tst = pd.DataFrame({"symbol": "AAA", "date": dates}) result = tst.duplicated(["date", "symbol"]) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index c6c17313489..c908b56c56a 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -325,7 +325,7 @@ def sample_function( ) @sql_count_checker(query_count=7, fallback_count=1, sproc_count=1) def test_property_methods_telemetry(): - datetime_series = pd.date_range("2000-01-01", periods=3, freq="h") + datetime_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h")) ret_series = datetime_series.dt.timetz assert len(ret_series._query_compiler.snowpark_pandas_api_calls) == 1 api_call = ret_series._query_compiler.snowpark_pandas_api_calls[0] From 159c81bdcc19c1883c878c38c76dd44b9e71d421 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Fri, 9 Aug 2024 12:20:42 -0700 Subject: [PATCH 3/4] Fix query counts and ctor updates --- .../snowpark/modin/pandas/general.py | 7 ++-- src/snowflake/snowpark/modin/pandas/series.py | 1 + .../snowpark/modin/plugin/extensions/index.py | 2 +- tests/integ/modin/frame/test_loc.py | 39 +++++-------------- tests/integ/modin/frame/test_set_index.py | 6 +-- tests/integ/modin/series/test_loc.py | 24 +++++------- tests/integ/modin/test_concat.py | 4 +- tests/integ/modin/tools/test_to_datetime.py | 2 +- 8 files changed, 28 insertions(+), 57 deletions(-) diff --git a/src/snowflake/snowpark/modin/pandas/general.py b/src/snowflake/snowpark/modin/pandas/general.py index 4161d316b0e..af0369771bf 100644 --- a/src/snowflake/snowpark/modin/pandas/general.py +++ b/src/snowflake/snowpark/modin/pandas/general.py @@ -1352,7 +1352,7 @@ def to_datetime( infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: Any = "unix", cache: bool = True, -) -> Series | DatetimeScalar | NaTType | None: +) -> pd.DatetimeIndex | Series | DatetimeScalar | NaTType | None: """ Convert argument to datetime. 
@@ -1459,8 +1459,7 @@ def to_datetime( parsing): - scalar: :class:`Timestamp` (or :class:`datetime.datetime`) - - array-like: :class:`~snowflake.snowpark.modin.pandas.Series` with :class:`datetime64` dtype containing - :class:`datetime.datetime` (or + - array-like: :class:`~snowflake.snowpark.modin.pandas.DatetimeIndex` (or :class: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`object` dtype containing :class:`datetime.datetime`) - Series: :class:`~snowflake.snowpark.modin.pandas.Series` of :class:`datetime64` dtype (or @@ -2170,7 +2169,7 @@ def date_range( qc = qc.set_index_from_columns(qc.columns.tolist(), include_index=False) # Set index column name. qc = qc.set_index_names([name]) - return pd.DatetimeIndex(data=qc) + return pd.DatetimeIndex(query_compiler=qc) @snowpark_pandas_telemetry_standalone_function_decorator diff --git a/src/snowflake/snowpark/modin/pandas/series.py b/src/snowflake/snowpark/modin/pandas/series.py index a494b513de5..f268a21306b 100644 --- a/src/snowflake/snowpark/modin/pandas/series.py +++ b/src/snowflake/snowpark/modin/pandas/series.py @@ -133,6 +133,7 @@ def __init__( # Convert lazy index to Series without pulling the data to client. if isinstance(data, pd.Index): query_compiler = data.to_series(index=index, name=name)._query_compiler + query_compiler = query_compiler.reset_index(drop=True) elif isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index e11ac325f0d..a3b4265708a 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2429,4 +2429,4 @@ def _to_datetime( origin, include_index=True, ) - return DatetimeIndex(data=new_qc) + return DatetimeIndex(query_compiler=new_qc) diff --git a/tests/integ/modin/frame/test_loc.py b/tests/integ/modin/frame/test_loc.py index f258f261b51..1012a0d3959 100644 --- a/tests/integ/modin/frame/test_loc.py +++ b/tests/integ/modin/frame/test_loc.py @@ -146,7 +146,7 @@ def test_df_loc_get_tuple_key( snow_row = row query_count = 1 - if is_scalar(row) or isinstance(row, tuple) or isinstance(row, native_pd.Index): + if is_scalar(row) or isinstance(row, tuple): query_count = 2 with SqlCounter( @@ -945,11 +945,7 @@ def loc_set_helper(df): _row_key = key_converter(row_key, df) df.loc[_row_key] = pd.DataFrame(item) - with SqlCounter( - # one extra query to convert to series to set item - query_count=2 if key_type == "index" else 1, - join_count=expected_join_count, - ): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( pd.DataFrame(native_df), native_df, loc_set_helper, inplace=True ) @@ -971,11 +967,7 @@ def loc_set_helper(df): _row_key = key_converter(row_key, df) df.loc[_row_key, :] = pd.DataFrame(item) - with SqlCounter( - # one extra query to convert to series to set item - query_count=2 if key_type == "index" else 1, - join_count=expected_join_count, - ): + with SqlCounter(query_count=1, join_count=expected_join_count): eval_snowpark_pandas_result( pd.DataFrame(native_df), native_df, loc_set_helper, inplace=True ) @@ -1153,9 +1145,6 @@ def loc_set_helper(df): query_count, join_count = 1, 2 if not all(isinstance(rk_val, bool) for rk_val in row_key): join_count += 2 - # one extra query to convert to native pandas to initialize series and set item - if key_type == "index": - query_count = 2 if isinstance(col_key, 
native_pd.Series): query_count += 1 with SqlCounter(query_count=query_count, join_count=join_count): @@ -1235,10 +1224,6 @@ def loc_set_helper(df): if isinstance(col_key, native_pd.Series): query_count += 1 - # one extra query to convert to native pandas to initialize series and set item - if key_type == "index": - query_count += 1 - with SqlCounter( query_count=query_count, join_count=join_count, @@ -1316,8 +1301,7 @@ def loc_set_helper(df): else: df.loc[row_key, :] = pd.DataFrame(item) - # one extra query to convert index to native pandas to initialize series and set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=4): + with SqlCounter(query_count=1, join_count=4): if item.index.has_duplicates: # pandas fails to update duplicated rows with duplicated item with pytest.raises( @@ -1641,8 +1625,7 @@ def loc_helper(df): return _df.loc[_key] - # one extra query to convert index to native pandas to initialize series and set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -1985,8 +1968,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): ) # default index - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_df, default_index_native_df, @@ -2000,8 +1982,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): "index" ) non_default_index_snowpark_pandas_df = pd.DataFrame(non_default_index_native_df) - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( non_default_index_snowpark_pandas_df, non_default_index_native_df, @@ -2021,8 +2002,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): ] ) dup_snowpandas_df = pd.DataFrame(dup_native_df) - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_df, dup_native_df, @@ -2047,8 +2027,7 @@ def loc_key_type_convert(key, is_snow_type, index_name=None): ] ) dup_snowpandas_df = pd.DataFrame(dup_native_df) - # one extra query to convert to series to set item - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_df, dup_native_df, diff --git a/tests/integ/modin/frame/test_set_index.py b/tests/integ/modin/frame/test_set_index.py index 15566d630f1..e0088673282 100644 --- a/tests/integ/modin/frame/test_set_index.py +++ b/tests/integ/modin/frame/test_set_index.py @@ -320,11 +320,7 @@ def test_set_index_pass_arrays_duplicate(obj_type1, obj_type2, drop, append, nat obj_type2 = native_pd.Index native_keys = [obj_type1(array), obj_type2(array)] - query_count = 4 - # one extra query per modin index to create the series and set index - query_count += 1 if obj_type1 == native_pd.Index else 0 - query_count += 1 if obj_type2 == native_pd.Index else 0 - with SqlCounter(query_count=query_count, join_count=2): + with SqlCounter(query_count=4, join_count=2): eval_snowpark_pandas_result( snow_df, native_df, diff --git 
a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 32c1bf64c4a..21fbf6aeafa 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -319,7 +319,7 @@ def loc_helper(ser): return _ser.loc[_key] default_index_series = pd.Series(default_index_native_series) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_series, default_index_native_series, @@ -480,7 +480,7 @@ def type_convert(key, is_snow_type): # Note: here number of queries are 2 due to the data type of the series is variant and to_pandas needs to call # typeof to get the value types # TODO: SNOW-933782 optimize to_pandas for variant columns to only fire one query - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_series, default_index_native_series, @@ -497,7 +497,7 @@ def type_convert(key, is_snow_type): non_default_index_snowpark_pandas_series = pd.Series( non_default_index_native_series ) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( non_default_index_snowpark_pandas_series, non_default_index_native_series, @@ -514,7 +514,7 @@ def type_convert(key, is_snow_type): ] ) dup_snowpandas_series = pd.Series(dup_native_series) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_series, dup_native_series, @@ -539,7 +539,7 @@ def type_convert(key, is_snow_type): ] ) dup_snowpandas_series = pd.Series(dup_native_series) - with SqlCounter(query_count=2 if key_type == "index" else 1, join_count=1): + with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( dup_snowpandas_series, dup_native_series, @@ -776,15 +776,15 @@ def loc_set_helper(s): s.loc[_row_key] = _item query_count = 1 - # 6 extra queries: sum of two cases below + # 5 extra queries: sum of two cases below if item_type.startswith("index") and key_type.startswith("index"): - query_count = 7 + query_count = 6 # 4 extra queries: 1 query to convert item index to pandas in loc_set_helper, 2 for iter, and 1 for to_list elif item_type.startswith("index"): query_count = 5 - # 2 extra queries: 1 query to convert key index to pandas in loc_set_helper and 1 to convert to series to setitem + # 1 extra query to convert to series to setitem elif key_type.startswith("index"): - query_count = 3 + query_count = 2 with SqlCounter(query_count=query_count, join_count=expected_join_count): eval_snowpark_pandas_result( pd.Series(series), series, loc_set_helper, inplace=True @@ -834,11 +834,7 @@ def loc_set_helper(s): else: s.loc[pd.Series(row_key)] = pd.DataFrame(item) - qc = 0 - if key_type == "index": - qc = 1 - - with SqlCounter(query_count=qc): + with SqlCounter(query_count=0): eval_snowpark_pandas_result( pd.Series(series), series, diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 9437bb6a36c..628af787ac4 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -657,10 +657,10 @@ def test_concat_keys_with_none(df1, df2, axis): ) def test_concat_with_keys_and_names(df1, df2, names, name1, name2, axis): # One extra query to convert index to native pandas when creating df - with 
SqlCounter(query_count=0 if name1 is None or axis == 1 else 4, join_count=0): + with SqlCounter(query_count=0 if name1 is None or axis == 1 else 3, join_count=0): df1 = df1.rename_axis(name1, axis=axis) # One extra query to convert index to native pandas when creating df - with SqlCounter(query_count=0 if name2 is None or axis == 1 else 4, join_count=0): + with SqlCounter(query_count=0 if name2 is None or axis == 1 else 3, join_count=0): df2 = df2.rename_axis(name2, axis=axis) expected_join_count = ( diff --git a/tests/integ/modin/tools/test_to_datetime.py b/tests/integ/modin/tools/test_to_datetime.py index 07fb4aefebf..a0ac55958a9 100644 --- a/tests/integ/modin/tools/test_to_datetime.py +++ b/tests/integ/modin/tools/test_to_datetime.py @@ -570,7 +570,7 @@ def test_to_datetime_mixed_datetime_and_string(self): pytest.param("US/Central"), ], ) - @sql_count_checker(query_count=3) + @sql_count_checker(query_count=2) def test_to_datetime_dtarr(self, tz): # DatetimeArray dti = native_pd.date_range("1965-04-03", periods=19, freq="2W", tz=tz) From d50dba638149d82ddda48b57e7d7331721d36422 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Sat, 10 Aug 2024 15:14:52 -0700 Subject: [PATCH 4/4] Fix more query counts --- .../snowpark/modin/plugin/extensions/index.py | 1 + tests/integ/modin/frame/test_axis.py | 21 ++++++++----------- .../integ/modin/frame/test_drop_duplicates.py | 3 +-- tests/integ/modin/frame/test_getitem.py | 4 ++-- .../modin/frame/test_nlargest_nsmallest.py | 2 +- tests/integ/modin/frame/test_set_index.py | 21 +++++-------------- tests/integ/modin/series/test_axis.py | 20 +++++++++--------- tests/integ/modin/series/test_getitem.py | 2 +- tests/integ/modin/series/test_loc.py | 3 --- tests/integ/modin/series/test_setitem.py | 8 +++---- tests/integ/modin/test_concat.py | 2 -- tests/integ/modin/test_telemetry.py | 2 +- 12 files changed, 34 insertions(+), 55 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py index a3b4265708a..82acc627601 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/index.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/index.py @@ -2383,6 +2383,7 @@ def _to_datetime( origin: DateTimeOrigin = "unix", ) -> Index: """ + Convert index to DatetimeIndex. Args: errors: {'ignore', 'raise', 'coerce'}, default 'raise' If 'raise', then invalid parsing will raise an exception. diff --git a/tests/integ/modin/frame/test_axis.py b/tests/integ/modin/frame/test_axis.py index b253906ba53..28cf55dee40 100644 --- a/tests/integ/modin/frame/test_axis.py +++ b/tests/integ/modin/frame/test_axis.py @@ -81,8 +81,7 @@ def test_index(test_df): @pytest.mark.parametrize("test_df", test_dfs) -# One extra query to convert lazy index to series to set index -@sql_count_checker(query_count=9, join_count=3) +@sql_count_checker(query_count=8, join_count=3) def test_set_and_assign_index(test_df): def assign_index(df, keys): df.index = keys @@ -290,7 +289,7 @@ def test_duplicate_labels_assignment(): native_pd.DataFrame({"A": [3.14, 1.414, 1.732], "B": [9.8, 1.0, 0]}), "rows", [None] * 3, - 6, + 5, 2, ], [ # Labels is a MultiIndex from tuples. 
@@ -307,7 +306,7 @@ def test_duplicate_labels_assignment(): native_pd.DataFrame({"A": ["foo", "bar", 3], "B": [4, "baz", 6]}), 0, {1: "c", 2: "b", 3: "a"}, - 6, + 5, 2, ], [ @@ -327,7 +326,7 @@ def test_duplicate_labels_assignment(): ), 0, ['"row 1"', "row 2"], - 6, + 5, 2, ], [ @@ -340,7 +339,7 @@ def test_duplicate_labels_assignment(): ), "rows", list(range(10)), - 6, + 5, 2, ], [ @@ -875,8 +874,7 @@ def test_set_axis_df_raises_value_error_diff_error_msg( ): # Should raise a ValueError if the labels for row-like axis are invalid. # The error messages do not match native pandas. - # one extra query to convert to native pandas in series constructor - with SqlCounter(query_count=2 if isinstance(labels, native_pd.MultiIndex) else 3): + with SqlCounter(query_count=2): with pytest.raises(ValueError, match=error_msg): pd.DataFrame(native_df).set_axis(labels, axis=axis) @@ -894,7 +892,7 @@ def test_set_axis_df_raises_type_error_diff_error_msg( pd.DataFrame(native_df).set_axis(labels, axis=axis) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_df_set_axis_copy_true(caplog): # Test that warning is raised when copy argument is used. native_df = native_pd.DataFrame({"A": [1.25], "B": [3]}) @@ -935,12 +933,11 @@ def test_df_set_axis_with_quoted_index(): # check first that operation result is the same snow_df = pd.DataFrame(data) native_df = native_pd.DataFrame(data) - # One extra query to convert to native pandas in series constructor - with SqlCounter(query_count=4): + with SqlCounter(query_count=3): eval_snowpark_pandas_result(snow_df, native_df, helper) # then, explicitly compare axes - with SqlCounter(query_count=2): + with SqlCounter(query_count=1): ans = helper(snow_df) native_ans = helper(native_df) diff --git a/tests/integ/modin/frame/test_drop_duplicates.py b/tests/integ/modin/frame/test_drop_duplicates.py index 3cf38708038..35c4a8edb05 100644 --- a/tests/integ/modin/frame/test_drop_duplicates.py +++ b/tests/integ/modin/frame/test_drop_duplicates.py @@ -64,8 +64,7 @@ def test_drop_duplicates(subset, keep, ignore_index): query_count = 1 join_count = 2 if ignore_index is True: - # One extra query to convert index to native pandas in series constructor - query_count += 3 + query_count += 2 join_count += 3 with SqlCounter(query_count=query_count, join_count=join_count): assert_frame_equal( diff --git a/tests/integ/modin/frame/test_getitem.py b/tests/integ/modin/frame/test_getitem.py index 746a8aa6550..fd4ede77d77 100644 --- a/tests/integ/modin/frame/test_getitem.py +++ b/tests/integ/modin/frame/test_getitem.py @@ -39,9 +39,9 @@ def test_df_getitem_with_boolean_list_like( key, default_index_snowpark_pandas_df, default_index_native_df ): - # one added query to convert to native pandas and 2 added queries for series initialization + # one added query to convert to native pandas and 1 added query for series initialization with SqlCounter( - query_count=4 if isinstance(key, native_pd.Index) else 1, join_count=1 + query_count=3 if isinstance(key, native_pd.Index) else 1, join_count=1 ): # df[boolean list-like key] is the same as df.loc[:, boolean list-like key] if isinstance(key, native_pd.Index): diff --git a/tests/integ/modin/frame/test_nlargest_nsmallest.py b/tests/integ/modin/frame/test_nlargest_nsmallest.py index fa57ddeadd2..3b6318179f2 100644 --- a/tests/integ/modin/frame/test_nlargest_nsmallest.py +++ b/tests/integ/modin/frame/test_nlargest_nsmallest.py @@ -54,7 +54,7 @@ def test_nlargest_nsmallest_large_n(snow_df, native_df, method): ) 
-@sql_count_checker(query_count=5, join_count=1) +@sql_count_checker(query_count=4, join_count=1) def test_nlargest_nsmallest_overlapping_index_name(snow_df, native_df, method): snow_df = snow_df.rename_axis("A") native_df = native_df.rename_axis("A") diff --git a/tests/integ/modin/frame/test_set_index.py b/tests/integ/modin/frame/test_set_index.py index e0088673282..ae035f0b3a4 100644 --- a/tests/integ/modin/frame/test_set_index.py +++ b/tests/integ/modin/frame/test_set_index.py @@ -80,8 +80,7 @@ def test_set_index_multiindex_columns(snow_df): ) -# One extra query to convert to native pandas to create series to set index -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_set_index_negative(snow_df, native_df): index = pd.Index([1, 2]) native_index = native_pd.Index([1, 2]) @@ -122,7 +121,7 @@ def test_set_index_names(snow_df): # Verify name from input index is set. index = pd.Index([1, 2, 0]) index.names = ["iname"] - with SqlCounter(query_count=3): + with SqlCounter(query_count=2): assert snow_df.set_index(index).index.names == ["iname"] # Verify names from input multiindex are set. @@ -229,11 +228,8 @@ def test_set_index_pass_single_array(obj_type, drop, append, native_df): ) else: expected_query_count = 3 - if obj_type == pd.Series: + if obj_type == pd.Series or obj_type == pd.Index: expected_query_count = 4 - # two extra queries, one to convert to native pandas (like series case) and one to create the series to set index - if obj_type == pd.Index: - expected_query_count = 5 with SqlCounter(query_count=expected_query_count, join_count=1): eval_snowpark_pandas_result( snow_df, @@ -268,11 +264,7 @@ def test_set_index_pass_arrays(obj_type, drop, append, native_df): "a", key.to_pandas() if isinstance(key, (pd.Series, pd.Index)) else key, ] - query_count = 3 - # one extra query to convert to series to set index - if obj_type == pd.Index: - query_count = 4 - with SqlCounter(query_count=query_count, join_count=1): + with SqlCounter(query_count=3, join_count=1): eval_snowpark_pandas_result( snow_df, native_df, @@ -433,7 +425,7 @@ def test_set_index_raise_on_len(length, obj_type, drop, append, native_df): msg = "Length mismatch: Expected 3 rows, received array of length.*" # wrong length directly # one extra query to create the series to set index - with SqlCounter(query_count=3 if obj_type == native_pd.Index else 2): + with SqlCounter(query_count=2): eval_snowpark_pandas_result( snow_df, native_df, @@ -451,9 +443,6 @@ def test_set_index_raise_on_len(length, obj_type, drop, append, native_df): expected_query_count = 1 if obj_type == native_pd.Series: expected_query_count = 0 - # one extra query to convert to native pandas to create the series to set index - if obj_type == native_pd.Index: - expected_query_count = 2 keys = ["a", key] native_keys = ["a", native_key] with SqlCounter(query_count=expected_query_count): diff --git a/tests/integ/modin/series/test_axis.py b/tests/integ/modin/series/test_axis.py index af00662f8db..d099272d6e9 100644 --- a/tests/integ/modin/series/test_axis.py +++ b/tests/integ/modin/series/test_axis.py @@ -30,7 +30,7 @@ native_pd.Series({"A": [1, 2, 3], 5 / 6: [4, 5, 6]}), "index", [None] * 2, - 4, + 3, 1, ], [ @@ -44,7 +44,7 @@ ), "index", ["iccanobif", "serauqs", "semirp"], - 4, + 3, 1, ], [ @@ -58,7 +58,7 @@ ), "index", native_pd.Series(["iccanobif", "serauqs", "semirp"], name="reverse names"), - 4, + 3, 1, ], [ @@ -73,7 +73,7 @@ ), 0, native_pd.Index([99, 999, 9999, 99999, 999999]), - 4, + 3, 1, ], [ @@ -88,7 +88,7 @@ ), 0, 
native_pd.Index([99, 999, 9999, 99999, 999999], name="index with name"), - 4, + 3, 1, ], [ @@ -104,7 +104,7 @@ ), 0, native_pd.Index([99, 999, 9999, 99999, 999999], name="index with name"), - 4, + 3, 1, ], [ # Index is a MultiIndex from tuples. @@ -165,14 +165,14 @@ native_pd.Series({"A": ["foo", "bar", 3], "B": [4, "baz", 6]}), "index", {1: 1, 2: 2}, - 4, + 3, 1, ], [ native_pd.Series({"A": ["foo", "bar", 3], "B": [4, "baz", 6]}), "rows", {1, 2}, - 4, + 3, 1, ], ] @@ -440,7 +440,7 @@ def test_set_axis_series_raises_value_error_diff_error_msg( ): # Should raise a ValueError if length of labels passed in # don't match the number of rows. - with SqlCounter(query_count=2 if isinstance(labels, native_pd.MultiIndex) else 3): + with SqlCounter(query_count=2): with pytest.raises(ValueError, match=error_msg): pd.Series(ser).set_axis(labels, axis=axis) @@ -474,7 +474,7 @@ def test_set_axis_series_raises_type_error(ser, axis, labels, error_msg): pd.Series(ser).set_axis(labels, axis=axis) -@sql_count_checker(query_count=4, join_count=1) +@sql_count_checker(query_count=3, join_count=1) def test_series_set_axis_copy_true(caplog): # Test that warning is raised when copy argument is used. series = native_pd.Series([1.25]) diff --git a/tests/integ/modin/series/test_getitem.py b/tests/integ/modin/series/test_getitem.py index 3c297f32d0b..0ea84425d18 100644 --- a/tests/integ/modin/series/test_getitem.py +++ b/tests/integ/modin/series/test_getitem.py @@ -46,7 +46,7 @@ def getitem_helper(ser): _key, _ser = snow_key, ser return _ser[_key] - with SqlCounter(query_count=2 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_series, default_index_native_series, diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 21fbf6aeafa..aa16a841f27 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -477,9 +477,6 @@ def type_convert(key, is_snow_type): return s.loc[type_convert(native_series_key, isinstance(s, pd.Series))] # default index - # Note: here number of queries are 2 due to the data type of the series is variant and to_pandas needs to call - # typeof to get the value types - # TODO: SNOW-933782 optimize to_pandas for variant columns to only fire one query with SqlCounter(query_count=1, join_count=1): eval_snowpark_pandas_result( default_index_snowpark_pandas_series, diff --git a/tests/integ/modin/series/test_setitem.py b/tests/integ/modin/series/test_setitem.py index 407e93c6a12..50405643bc3 100644 --- a/tests/integ/modin/series/test_setitem.py +++ b/tests/integ/modin/series/test_setitem.py @@ -1560,7 +1560,7 @@ def test_series_setitem_with_empty_key_and_empty_item_negative( else: snowpark_key = key - with SqlCounter(query_count=1 if isinstance(key, native_pd.Index) else 0): + with SqlCounter(query_count=0): err_msg = "The length of the value/item to set is empty" with pytest.raises(ValueError, match=err_msg): @@ -1601,7 +1601,7 @@ def test_series_setitem_with_empty_key_and_empty_series_item( else: snowpark_key = key - with SqlCounter(query_count=2 if isinstance(key, native_pd.Index) else 1): + with SqlCounter(query_count=1): native_ser[key] = item snowpark_ser[ pd.Series(snowpark_key) @@ -1649,9 +1649,7 @@ def test_series_setitem_with_empty_key_and_scalar_item( else: snowpark_key = key - with SqlCounter( - query_count=2 if isinstance(key, native_pd.Index) else 1, join_count=2 - ): + with SqlCounter(query_count=1, join_count=2): native_ser[key] = 
item snowpark_ser[ pd.Series(snowpark_key) diff --git a/tests/integ/modin/test_concat.py b/tests/integ/modin/test_concat.py index 628af787ac4..1049d5ea21b 100644 --- a/tests/integ/modin/test_concat.py +++ b/tests/integ/modin/test_concat.py @@ -656,10 +656,8 @@ def test_concat_keys_with_none(df1, df2, axis): "name1, name2", [("one", "two"), ("one", None), (None, "two"), (None, None)] ) def test_concat_with_keys_and_names(df1, df2, names, name1, name2, axis): - # One extra query to convert index to native pandas when creating df with SqlCounter(query_count=0 if name1 is None or axis == 1 else 3, join_count=0): df1 = df1.rename_axis(name1, axis=axis) - # One extra query to convert index to native pandas when creating df with SqlCounter(query_count=0 if name2 is None or axis == 1 else 3, join_count=0): df2 = df2.rename_axis(name2, axis=axis) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index c908b56c56a..ba20286579a 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -474,7 +474,7 @@ def test_telemetry_private_method(name, method, expected_query_count): assert data["api_calls"] == [{"name": f"DataFrame.DataFrame.{name}"}] -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=2) def test_telemetry_property_index(): df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df._query_compiler.snowpark_pandas_api_calls.clear()
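
For reference, a minimal sketch of the user-facing effect of this patch series, assuming an active Snowpark session configured for Snowpark pandas. The import path follows the module layout visible in this diff (newer releases document a different entry point via modin.pandas plus the Snowpark plugin), and the printed reprs mirror the updated doctests above.

import snowflake.snowpark.modin.pandas as pd

# List-like input: previously returned a Snowpark pandas Series, now a lazy DatetimeIndex.
converted = pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00:15"])
print(converted)
# DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None)

# pd.date_range changes the same way.
rng = pd.date_range(start="1/1/2018", end="1/08/2018")
print(type(rng).__name__)  # DatetimeIndex

# Scalar input still returns a scalar Timestamp rather than an Index.
ts = pd.to_datetime("2018-10-26 12:00")
print(repr(ts))  # Timestamp('2018-10-26 12:00:00')

# Callers that relied on the old Series return type can convert explicitly; the
# Series constructor updated in this patch accepts a lazy Index without pulling
# data to the client.
as_series = pd.Series(converted)
print(as_series.dtype)  # datetime64[ns]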