
SNOW-1458135 Implement DataFrame and Series initialization with lazy Index objects #2137

Merged
68 commits
2094c3f
Update Series and DataFrame constructors to handle lazy Index objects…
sfc-gh-vbudati Aug 21, 2024
97a7229
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 21, 2024
1979257
update changelog
sfc-gh-vbudati Aug 21, 2024
5dbb76d
add more tests
sfc-gh-vbudati Aug 21, 2024
7de467f
fix minor bug
sfc-gh-vbudati Aug 21, 2024
5dd06fd
fix isocalendar docstring error
sfc-gh-vbudati Aug 21, 2024
8b94462
truncate tests, update changelog wording, reduce 2 queries to one que…
sfc-gh-vbudati Aug 22, 2024
c89dc5d
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 22, 2024
a2089b8
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 22, 2024
a9376c1
Get rid of the join performed when only index is an Index object and …
sfc-gh-vbudati Aug 22, 2024
420a5ac
Add back the index join query to DataFrame/Series constructor, update…
sfc-gh-vbudati Aug 22, 2024
66d634c
Update tests
sfc-gh-vbudati Aug 23, 2024
f277041
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 23, 2024
6a2cb79
added edge case logic, fix test query count
sfc-gh-vbudati Aug 23, 2024
df96f4a
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 23, 2024
f971b0d
more test fixes
sfc-gh-vbudati Aug 23, 2024
13db956
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 23, 2024
8c78f8d
fix dict case
sfc-gh-vbudati Aug 23, 2024
7970101
more test case fixes
sfc-gh-vbudati Aug 24, 2024
f3de1c3
correct the logic for series created with dict and index
sfc-gh-vbudati Aug 26, 2024
2447022
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Aug 26, 2024
82728bf
fix query counts
sfc-gh-vbudati Aug 26, 2024
23587a4
Merge branch 'vbudati/SNOW-1458135-df-series-init-with-lazy-index' of…
sfc-gh-vbudati Aug 26, 2024
1577ddc
fix join count
sfc-gh-vbudati Aug 26, 2024
8903f60
refactor series and df
sfc-gh-vbudati Sep 4, 2024
668c889
merge main into current branch
sfc-gh-vbudati Sep 4, 2024
67a07c1
refactor dataframe and series constructors
sfc-gh-vbudati Sep 4, 2024
a4351ba
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 4, 2024
1453680
fix docstring tests
sfc-gh-vbudati Sep 5, 2024
f39e751
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 5, 2024
b73f027
fix some tests
sfc-gh-vbudati Sep 6, 2024
c6fc05d
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 6, 2024
d422f86
replace series constructor
sfc-gh-vbudati Sep 6, 2024
1ea5d00
fix tests
sfc-gh-vbudati Sep 9, 2024
024acd8
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 9, 2024
f4a80f3
fix loc and iloc tests
sfc-gh-vbudati Sep 9, 2024
ce1ffa6
fix test
sfc-gh-vbudati Sep 9, 2024
00d2a8b
fix test
sfc-gh-vbudati Sep 9, 2024
cb91849
fix last valid index error
sfc-gh-vbudati Sep 9, 2024
d9fdbb0
remove stuff unnecessarily commented out
sfc-gh-vbudati Sep 9, 2024
3d5b785
explain high query count
sfc-gh-vbudati Sep 9, 2024
7f9dbaa
rewrite binary op test, fix coverage
sfc-gh-vbudati Sep 9, 2024
6de9f49
fix tests
sfc-gh-vbudati Sep 11, 2024
c2fb474
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 11, 2024
2274d1e
remove print statements and unnecessary comments
sfc-gh-vbudati Sep 11, 2024
9eef8d7
fix tests
sfc-gh-vbudati Sep 11, 2024
cc09403
increase coverage
sfc-gh-vbudati Sep 12, 2024
10c3954
try to move out common logic, add more tests
sfc-gh-vbudati Sep 14, 2024
64dda24
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 14, 2024
da56734
update df init
sfc-gh-vbudati Sep 14, 2024
8b47e17
moved common logic out, fixed some tests
sfc-gh-vbudati Sep 16, 2024
fa4eb09
remove unnecessary diffs
sfc-gh-vbudati Sep 16, 2024
db28630
fix doctest and couple of tests
sfc-gh-vbudati Sep 17, 2024
17be4c3
apply feedback to simplify logic
sfc-gh-vbudati Sep 18, 2024
301f47f
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 18, 2024
2eb14a7
update query counts to use constants
sfc-gh-vbudati Sep 18, 2024
95065f7
Merge branch 'vbudati/SNOW-1458135-df-series-init-with-lazy-index' of…
sfc-gh-vbudati Sep 18, 2024
d9bbd9b
remove docstring update, add docstrings for helper functions
sfc-gh-vbudati Sep 18, 2024
f40c5b4
try to break down df init into three steps: data, columns, and index
sfc-gh-vbudati Sep 20, 2024
8cce409
merge main into current branch
sfc-gh-vbudati Sep 20, 2024
f37b80a
add dtype logic
sfc-gh-vbudati Sep 24, 2024
83de657
merge main into current branch
sfc-gh-vbudati Sep 24, 2024
57bf89c
fix tests
sfc-gh-vbudati Sep 24, 2024
8ed8e5b
reduce git diff, add xfails instead of modifying tests
sfc-gh-vbudati Sep 24, 2024
cea97e0
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 25, 2024
03a68dc
Address feedback
sfc-gh-vbudati Sep 25, 2024
19af4bb
Merge branch 'main' into vbudati/SNOW-1458135-df-series-init-with-laz…
sfc-gh-vbudati Sep 25, 2024
1eb17dd
add changelog entry about non-existent columns/index values being sup…
sfc-gh-vbudati Sep 25, 2024
23 changes: 23 additions & 0 deletions src/snowflake/snowpark/modin/plugin/_internal/utils.py
@@ -12,6 +12,7 @@
import numpy as np
import pandas as native_pd
from pandas._typing import Scalar
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import is_integer_dtype, is_object_dtype, is_scalar
from pandas.core.dtypes.inference import is_list_like

@@ -1998,6 +1999,28 @@ def rindex(lst: list, value: int) -> int:
return len(lst) - lst[::-1].index(value) - 1
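Only the return line of `rindex` is visible in this hunk; a standalone sketch of the helper, which returns the position of the last occurrence of `value`:

```python
def rindex(lst: list, value: int) -> int:
    # Find value in the reversed list, then translate that position
    # back into an index on the original (unreversed) list.
    return len(lst) - lst[::-1].index(value) - 1

print(rindex([1, 2, 3, 2, 1], 2))  # last occurrence of 2 is at index 3
```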


def error_checking_for_init(
index: Any, dtype: Union[str, np.dtype, ExtensionDtype]
) -> None:
"""
Common error messages for the Series and DataFrame constructors.

Parameters
----------
index: Any
The index to check.
dtype: str, numpy.dtype, or ExtensionDtype
The dtype to check.
"""
from modin.pandas import DataFrame

if isinstance(index, DataFrame): # pandas raises the same error
raise ValueError("Index data must be 1-dimensional")

if dtype == "category":
raise NotImplementedError("pandas type category is not implemented")
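Both branches of `error_checking_for_init` fail fast before any query compiler work happens. A self-contained sketch of the same control flow (`_DataFrameStandIn` is a hypothetical stand-in, since the real check imports `modin.pandas.DataFrame`):

```python
class _DataFrameStandIn:
    """Hypothetical stand-in for modin.pandas.DataFrame."""

def error_checking_for_init(index, dtype) -> None:
    # Mirrors the two checks added in the diff above.
    if isinstance(index, _DataFrameStandIn):  # pandas raises the same error
        raise ValueError("Index data must be 1-dimensional")
    if dtype == "category":
        raise NotImplementedError("pandas type category is not implemented")

# Both error paths raise immediately, with no query issued:
try:
    error_checking_for_init(_DataFrameStandIn(), dtype=None)
except ValueError as err:
    index_error = str(err)

try:
    error_checking_for_init(index=[1, 2, 3], dtype="category")
except NotImplementedError as err:
    dtype_error = str(err)
```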


def convert_index_to_qc(index: Any) -> Any:
Contributor Author:
I'm not sure how to make the return type "SnowflakeQueryCompiler" without causing circular import issues

Collaborator:
using "SnowflakeQueryCompiler" with quotes

Contributor Author:
That does not work either - it still causes the issues

Contributor:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import SnowflakeQueryCompiler

have you tried this?
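The suggested pattern works because the guarded import is only evaluated by static type checkers; at runtime `TYPE_CHECKING` is `False`, so the circular import never executes. Quoting the annotation alone fails only because the name is then never importable for the checker. A minimal sketch (the module name here is hypothetical):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only static type checkers evaluate this block; at runtime
    # TYPE_CHECKING is False, so the circular import never happens.
    from snowflake_query_compiler_module import SnowflakeQueryCompiler  # hypothetical module

def convert_index_to_qc(index) -> "SnowflakeQueryCompiler":
    # Quoting the annotation defers its evaluation, so the name only
    # needs to resolve for the type checker, not at runtime.
    ...
```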

"""
Method to convert an object representing an index into a query compiler for set_index or reindex.
@@ -87,6 +87,7 @@
from snowflake.snowpark.modin.plugin._internal.utils import (
convert_index_to_list_of_qcs,
convert_index_to_qc,
error_checking_for_init,
is_repr_truncated,
)
from snowflake.snowpark.modin.plugin._typing import ListLike
@@ -484,8 +485,7 @@ def __init__(
self._query_compiler = query_compiler
return

if isinstance(index, DataFrame): # pandas raises the same error
raise ValueError("Index data must be 1-dimensional")
error_checking_for_init(index, dtype)

# The logic followed here is:
# 1. Create a query_compiler from the provided data. If columns are provided, add/select the columns.
@@ -500,6 +500,7 @@
# If the data is an Index object, convert it to a DataFrame to make sure that the values are in the
# correct format: the values are a data column, not an index column.
if data.name is None:
# If no name is provided, the default name is 0.
new_name = 0 if columns is None else columns[0]
else:
new_name = data.name
@@ -510,6 +511,7 @@
query_compiler = data._query_compiler.copy()
# We set the column name if it is not in the provided Series `data`.
if data.name is None:
# If no name is provided, the default name is 0.
query_compiler = query_compiler.set_columns(columns or [0])
if columns is not None and data.name not in columns:
# If the columns provided are not in the named Series, pandas clears
@@ -607,9 +609,7 @@
if all(isinstance(v, Index) for v in data):
# Special case: if all the values are Index objects, they are always present in the
# final result with the provided column names. Therefore, rename the columns.
new_qc = new_qc.set_columns(
try_convert_index_to_native(columns)
)
new_qc = new_qc.set_columns(columns)
else:
new_qc = new_qc.reindex(axis=1, labels=columns)
self._query_compiler = new_qc
@@ -618,14 +618,16 @@
# If only some data is a Snowpark pandas object, convert it to pandas objects.
res = []
for v in data:
if isinstance(v, (Index)):
res.append(v.to_pandas())
elif isinstance(v, BasePandasDataset):
if isinstance(v, (Index, BasePandasDataset)):
res.append(v.to_pandas())
# elif is_dict_like(v) or isinstance(v, (native_pd.Series, native_pd.DataFrame, native_pd.Index)):
# res.append(v)
else:
# Need to convert this is a native pandas object since native pandas incorrectly
# tries to perform `get_indexer` on it.
res.append(native_pd.Index(v if is_list_like(v) else [v]))
# # Need to convert this is a native pandas object since native pandas incorrectly
# # tries to perform `get_indexer` on it. Specify dtype=object so that pandas does not
# # cast the data provided. In some cases, None turns to NaN, which is not desired.
# res.append(native_pd.Index(v, dtype=object) if is_list_like(v) else v)
res.append(v)
data = res

query_compiler = from_pandas(
Expand Down Expand Up @@ -662,13 +664,14 @@ def __init__(

# 3. If data is a DataFrame, filter result
# ----------------------------------------
if isinstance(data, DataFrame):
# To select the required index and columns for the resultant DataFrame,
# perform .loc[] on the created query compiler.
index = slice(None) if index is None else index
columns = slice(None) if columns is None else columns
if isinstance(data, DataFrame) and columns is not None:
# To select the columns for the resultant DataFrame, perform .loc[] on the created query compiler.
# This step is performed to ensure that the right columns are picked from the InternalFrame since we
# never explicitly drop the unwanted columns.
query_compiler = (
DataFrame(query_compiler=query_compiler).loc[index, columns]._query_compiler
DataFrame(query_compiler=query_compiler)
.loc[slice(None), columns]
._query_compiler
)

# 4. Setting the query compiler
@@ -1181,6 +1184,9 @@ def insert(
# Dictionary keys are treated as index column and this should be joined with
# index of target dataframe. This behavior is similar to 'value' being DataFrame
# or Series, so we simply create Series from dict data here.
if isinstance(value, set):
raise TypeError(f"'{type(value).__name__}' type is unordered")

if isinstance(value, dict):
value = Series(value, name=column)
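The new `set` guard matches native pandas, which rejects sets as column values because they carry no defined order. A quick illustration against native pandas (behavior as observed in recent pandas releases):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
try:
    # Sets are unordered, so there is no well-defined row assignment.
    df.insert(1, "b", {7, 8, 9})
except TypeError as err:
    message = str(err)  # "'set' type is unordered"
```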

@@ -45,6 +45,7 @@
from snowflake.snowpark.modin.plugin._internal.utils import (
convert_index_to_list_of_qcs,
convert_index_to_qc,
error_checking_for_init,
)
from snowflake.snowpark.modin.plugin._typing import DropKeep, ListLike
from snowflake.snowpark.modin.plugin.utils.error_message import (
@@ -367,8 +368,7 @@ def __init__(
self.name = name
return

if isinstance(index, spd.DataFrame): # pandas raises the same error
raise ValueError("Index data must be 1-dimensional")
error_checking_for_init(index, dtype)

if isinstance(data, spd.DataFrame):
# pandas raises an ambiguous error:
@@ -398,11 +398,12 @@
else:
# CASE IV: Non-Snowpark pandas data
# If the data is not a Snowpark pandas object, convert it to a query compiler.
# The query compiler uses the '__reduced__' name internally as a column name to represent pandas
# Series objects that are not explicitly assigned a name.
# This helps to distinguish between an N-element Series and 1xN DataFrame.
name = name or MODIN_UNNAMED_SERIES_LABEL
if (
isinstance(data, (native_pd.Series, native_pd.Index))
and data.name is not None
):
if hasattr(data, "name") and data.name is not None:
# If data is an object that has a name field, use that as the name of the new Series.
name = data.name
# If any of the values are Snowpark pandas objects, convert them to native pandas objects.
Collaborator:
Under this case, shouldn't we try to convert the other ones to Snowpark pandas objects instead of pulling them to local? Or maybe we should just error out.

Do you have an example of this case?

Contributor Author:
One example where it's better to convert to pandas is this:

    data = {"A": pd.Series([1, 2, 3]), "B": pd.Index([4, 5, 6]), "C": 5}
    pd.DataFrame(data)
    Out[58]: 
       A  B  C
    0  1  4  5
    1  2  5  5
    2  3  6  5

5 is put in every single row even though it's a scalar in the dict
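The broadcast in that example can be reproduced with native pandas alone, which is why pulling the mixed values to local pandas preserves the semantics:

```python
import pandas as pd

# A scalar dict value is broadcast to every row, while the list-like
# values must all share the same length.
df = pd.DataFrame({"A": pd.Series([1, 2, 3]), "B": pd.Index([4, 5, 6]), "C": 5})
print(df["C"].tolist())  # [5, 5, 5]
```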

if not isinstance(
@@ -422,9 +423,9 @@
native_pd.DataFrame(
native_pd.Series(
data=data,
dtype=dtype,
# Handle setting the index, if it is a lazy index, outside this block.
# If the index is a lazy index, handle setting it outside this block.
index=None if isinstance(index, (Index, Series)) else index,
dtype=dtype,
name=name,
copy=copy,
fastpath=fastpath,
11 changes: 6 additions & 5 deletions tests/integ/modin/frame/test_dtypes.py
@@ -473,22 +473,23 @@ def test_empty_index(index, expected_index_dtype):


@pytest.mark.parametrize(
"input_data, type_msg",
"input_data, dtype, type_msg",
[
(native_pd.Categorical([1, 2, 3, 1, 2, 3]), "category"),
(native_pd.Categorical(["a", "b", "c", "a", "b", "c"]), "category"),
(native_pd.Categorical([1, 2, 3, 1, 2, 3]), "category", "category"),
(native_pd.Categorical(["a", "b", "c", "a", "b", "c"]), "category", "category"),
(
native_pd.period_range("2015-02-03 11:22:33.4567", periods=5, freq="s"),
None,
r"period\[s\]",
),
],
)
@sql_count_checker(query_count=0)
def test_unsupported_dtype_raises(input_data, type_msg) -> None:
def test_unsupported_dtype_raises(input_data, dtype, type_msg) -> None:
with pytest.raises(
NotImplementedError, match=f"pandas type {type_msg} is not implemented"
):
pd.Series(input_data)
pd.Series(input_data, dtype=dtype)


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_idxmax_idxmin.py
@@ -194,7 +194,7 @@ def test_idxmax_idxmin_with_dates(func, axis):
)


@sql_count_checker(query_count=1, join_count=1)
@sql_count_checker(query_count=1)
@pytest.mark.parametrize("func", ["idxmax", "idxmin"])
@pytest.mark.parametrize(
"axis",
6 changes: 2 additions & 4 deletions tests/integ/modin/frame/test_insert.py
@@ -212,7 +212,7 @@ def test_insert_dataframe_shape_negative(native_df):
(np.ones((1, 1)), 1),
([1, 2], 1), # len < number of rows
((6, 7, 8, 9), 1), # len > number of rows
({"a", "b", "c"}, 1), # python set
({"a", "b", "c"}, 0), # python set
],
)
def test_insert_value_negative(native_df, value, expected_query_count):
@@ -725,12 +725,10 @@ def test_insert_multiindex_column_negative(snow_df, columns, insert_label):
[["a", "b", "b", "d", "e"], ["x", "y", "z", "u", "u"], True],
],
)
@sql_count_checker(query_count=1, join_count=3)
@sql_count_checker(query_count=3, join_count=1)
def test_insert_with_unique_and_duplicate_index_values(
index_values, other_index_values, expect_mismatch
):
# Two of the three joins come from creating the DataFrame with non-Snowpark pandas data
# and a Snowpark pandas Index. The third join is from the insert operation.
data = list(range(5))
data1 = {"foo": data}
data2 = {"bar": [val * 10 for val in data]}
16 changes: 8 additions & 8 deletions tests/integ/modin/frame/test_loc.py
@@ -3945,12 +3945,12 @@ def test_raise_set_cell_with_list_like_value_error():
reason="SNOW-1652608 result series name incorrectly set"
),
), # 1 join from df creation, 1 join from squeeze, 2 joins from to_pandas during eval
(["1 day", "3 days"], 1, 2),
([True, False, False], 1, 2),
(slice(None, "4 days"), 1, 1),
(slice(None, "4 days", 2), 1, 1),
(slice("1 day", "2 days"), 1, 1),
(slice("1 day 1 hour", "2 days 2 hours", -1), 1, 1),
(["1 day", "3 days"], 1, 1),
([True, False, False], 1, 1),
(slice(None, "4 days"), 1, 0),
(slice(None, "4 days", 2), 1, 0),
(slice("1 day", "2 days"), 1, 0),
(slice("1 day 1 hour", "2 days 2 hours", -1), 1, 0),
],
)
def test_df_loc_get_with_timedelta(key, query_count, join_count):
@@ -4017,7 +4017,7 @@
),
],
)
@sql_count_checker(query_count=1, join_count=1)
@sql_count_checker(query_count=2)
def test_df_loc_get_with_timedelta_behavior_difference(key, expected_result):
# In these test cases, native pandas raises a KeyError but Snowpark pandas works correctly.
data = {
Expand All @@ -4037,7 +4037,7 @@ def test_df_loc_get_with_timedelta_behavior_difference(key, expected_result):
assert_frame_equal(actual_result, expected_result)


@sql_count_checker(query_count=2, join_count=2)
@sql_count_checker(query_count=3, join_count=1)
def test_df_loc_get_with_timedeltaindex_key():
data = {
"A": [1, 2, 3],
7 changes: 3 additions & 4 deletions tests/integ/modin/frame/test_mask.py
@@ -683,7 +683,7 @@ def test_dataframe_mask_with_duplicated_index_aligned(cond_frame, other):
native_other = other
snow_other = other

expected_join_count = 2 if isinstance(other, int) else 3
expected_join_count = 1 if isinstance(other, int) else 2
with SqlCounter(query_count=1, join_count=expected_join_count):
eval_snowpark_pandas_result(
snow_df,
@@ -694,9 +694,8 @@
)


# Three extra joins when creating the 3 snowpark pandas dataframes with non-Snowpark pandas
# data and Snowpark pandas Index.
@sql_count_checker(query_count=1, join_count=5)
# Three extra queries to convert to native index for dataframe constructor when creating the 3 snowpark pandas dataframes
@sql_count_checker(query_count=4, join_count=2)
def test_dataframe_mask_with_duplicated_index_unaligned():
data = [3, 4, 5, 2]
df_index = pd.Index([2, 1, 2, 3], name="index")