Skip to content

Commit 90569c9

Browse files
authored
Added correct casting and mod operation (#172)
1 parent c9ce6de commit 90569c9

File tree

5 files changed

+33
-19
lines changed

5 files changed

+33
-19
lines changed

dask_sql/input_utils/hive.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def wrapped_read_function(location, column_information, **kwargs):
108108
df = df.rename(columns=dict(zip(df.columns, column_information.keys())))
109109

110110
for col, expected_type in column_information.items():
111-
df = cast_column_type(df, col, expected_type)
111+
df[col] = cast_column_type(df[col], expected_type)
112112

113113
return df
114114

@@ -146,8 +146,9 @@ def wrapped_read_function(location, column_information, **kwargs):
146146

147147
partition_id = 0
148148
for partition_key, partition_type in partition_information.items():
149-
table[partition_key] = partition_values[partition_id]
150-
table = cast_column_type(table, partition_key, partition_type)
149+
table[partition_key] = cast_column_type(
150+
partition_values[partition_id], partition_type
151+
)
151152

152153
partition_id += 1
153154

dask_sql/mappings.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -255,24 +255,20 @@ def similar_type(lhs: type, rhs: type) -> bool:
255255
return False
256256

257257

258-
def cast_column_type(
259-
df: dd.DataFrame, column_name: str, expected_type: type
260-
) -> dd.DataFrame:
258+
def cast_column_type(column: dd.Series, expected_type: type) -> dd.Series:
261259
"""
262260
Cast the type of the given column to the expected type,
263261
if they are far "enough" away.
264262
This means, a float will never be converted into a double
265263
or a tinyint into another int - but a string to an integer etc.
266264
"""
267-
current_type = df[column_name].dtype
265+
current_type = column.dtype
268266

269-
logger.debug(
270-
f"Column {column_name} has type {current_type}, expecting {expected_type}..."
271-
)
267+
logger.debug(f"Column has type {current_type}, expecting {expected_type}...")
272268

273269
if similar_type(current_type, expected_type):
274270
logger.debug("...not converting.")
275-
return df
271+
return column
276272

277273
current_float = pd.api.types.is_float_dtype(current_type)
278274
expected_integer = pd.api.types.is_integer_dtype(expected_type)
@@ -282,9 +278,9 @@ def cast_column_type(
282278
# because NA is a different type. It works with np.NaN though.
283279
# For our use case, that does not matter, as the conversion to integer later
284280
# will convert both NA and np.NaN to NA.
285-
df[column_name] = da.trunc(df[column_name].fillna(value=np.NaN))
281+
column = da.trunc(column.fillna(value=np.NaN))
286282

287-
logger.debug(f"Need to cast {column_name} from {current_type} to {expected_type}")
288-
df[column_name] = df[column_name].astype(expected_type)
283+
logger.debug(f"Need to cast from {current_type} to {expected_type}")
284+
column = column.astype(expected_type)
289285

290-
return df
286+
return column

dask_sql/physical/rel/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,6 @@ def fix_dtype_to_row_type(
106106
expected_type = sql_to_python_type(field_type)
107107
field_name = cc.get_backend_by_frontend_index(index)
108108

109-
df = cast_column_type(df, field_name, expected_type)
109+
df[field_name] = cast_column_type(df[field_name], expected_type)
110110

111111
return DataContainer(df, dc.column_container)

dask_sql/physical/rex/core/call.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from dask.utils import random_state_data
1515

1616
from dask_sql.datacontainer import DataContainer
17-
from dask_sql.mappings import sql_to_python_type
17+
from dask_sql.mappings import cast_column_type, sql_to_python_type
1818
from dask_sql.physical.rex import RexConverter
1919
from dask_sql.physical.rex.base import BaseRexPlugin
2020
from dask_sql.utils import (
@@ -179,6 +179,21 @@ def case(self, *operands) -> SeriesOrScalar:
179179
return then if where else other
180180

181181

182+
class CastOperation(Operation):
183+
"""The cast operator"""
184+
185+
needs_rex = True
186+
187+
def __init__(self):
188+
super().__init__(self.cast)
189+
190+
def cast(self, operand, rex=None) -> SeriesOrScalar:
191+
output_type = str(rex.getType())
192+
output_type = sql_to_python_type(output_type.upper())
193+
194+
return cast_column_type(operand, output_type)
195+
196+
182197
class IsFalseOperation(Operation):
183198
"""The is false operator"""
184199

@@ -650,7 +665,7 @@ class RexCallPlugin(BaseRexPlugin):
650665
"is distinct from": NotOperation().of(IsNotDistinctOperation()),
651666
"is not distinct from": IsNotDistinctOperation(),
652667
# special operations
653-
"cast": lambda x: x,
668+
"cast": CastOperation(),
654669
"case": CaseOperation(),
655670
"like": LikeOperation(),
656671
"similar to": SimilarOperation(),
@@ -680,7 +695,7 @@ class RexCallPlugin(BaseRexPlugin):
680695
"floor": CeilFloorOperation("floor"),
681696
"log10": Operation(da.log10),
682697
"ln": Operation(da.log),
683-
# "mod": Operation(da.mod), # needs cast
698+
"mod": Operation(da.mod),
684699
"power": Operation(da.power),
685700
"radians": Operation(da.radians),
686701
"round": TensorScalarOperation(lambda x, *ops: x.round(*ops), np.round),

tests/integration/test_rex.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,7 @@ def test_math_operations(c, df):
311311
, FLOOR(b) AS "floor"
312312
, LOG10(b) AS "log10"
313313
, LN(b) AS "ln"
314+
, MOD(b, 4) AS "mod"
314315
, POWER(b, 2) AS "power"
315316
, POWER(b, a) AS "power2"
316317
, RADIANS(b) AS "radians"
@@ -339,6 +340,7 @@ def test_math_operations(c, df):
339340
expected_df["floor"] = np.floor(df.b)
340341
expected_df["log10"] = np.log10(df.b)
341342
expected_df["ln"] = np.log(df.b)
343+
expected_df["mod"] = np.mod(df.b, 4)
342344
expected_df["power"] = np.power(df.b, 2)
343345
expected_df["power2"] = np.power(df.b, df.a)
344346
expected_df["radians"] = df.b / 180 * np.pi

0 commit comments

Comments (0)