20 changes: 16 additions & 4 deletions python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -55,7 +55,10 @@ def test_add(self):

for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(b_pser + pser, b_psser + psser, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(
b_pser + pser, b_psser + psser, check_exact=False, ignore_null=ignore_null
)
for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
if col == "bool":
@@ -74,7 +77,10 @@ def test_sub(self):
self.assertRaises(TypeError, lambda: b_psser - True)

for col in self.numeric_df_cols:
self.assert_eq(b_pser - pdf[col], b_psser - psdf[col], check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(
b_pser - pdf[col], b_psser - psdf[col], check_exact=False, ignore_null=ignore_null
)

for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser - psdf[col])
@@ -91,7 +97,10 @@ def test_mul(self):
self.assert_eq(b_pser * False, b_psser * False)

for col in self.numeric_df_cols:
self.assert_eq(b_pser * pdf[col], b_psser * psdf[col], check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(
b_pser * pdf[col], b_psser * psdf[col], check_exact=False, ignore_null=ignore_null
)

for col in self.non_numeric_df_cols:
pser, psser = pdf[col], psdf[col]
@@ -149,7 +158,10 @@ def test_mod(self):
self.assertRaises(TypeError, lambda: b_psser % True)

for col in self.numeric_df_cols:
self.assert_eq(b_pser % pdf[col], b_psser % psdf[col], check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(
b_pser % pdf[col], b_psser % psdf[col], check_exact=False, ignore_null=ignore_null
)

for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser % psdf[col])
46 changes: 34 additions & 12 deletions python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -44,15 +44,26 @@ def test_add(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser + pser, psser + psser, check_exact=False)
self.assert_eq(pser + 1, psser + 1, check_exact=False)
self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool), check_exact=False)
self.assert_eq(pser + True, psser + True, check_exact=False)
self.assert_eq(pser + False, psser + False, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(pser + pser, psser + psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(pser + 1, psser + 1, check_exact=False, ignore_null=ignore_null)
self.assert_eq(
pser + pser.astype(bool),
psser + psser.astype(bool),
check_exact=False,
ignore_null=ignore_null,
)
self.assert_eq(pser + True, psser + True, check_exact=False, ignore_null=ignore_null)
self.assert_eq(pser + False, psser + False, check_exact=False, ignore_null=ignore_null)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser + pdf[n_col], psser + psdf[n_col], check_exact=False)
self.assert_eq(
pser + pdf[n_col],
psser + psdf[n_col],
check_exact=False,
ignore_null=ignore_null,
)
else:
self.assertRaises(TypeError, lambda: psser + psdf[n_col])

@@ -63,15 +74,26 @@ def test_sub(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser - pser, psser - psser, check_exact=False)
self.assert_eq(pser - 1, psser - 1, check_exact=False)
self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool), check_exact=False)
self.assert_eq(pser - True, psser - True, check_exact=False)
self.assert_eq(pser - False, psser - False, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(pser - pser, psser - psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(pser - 1, psser - 1, check_exact=False, ignore_null=ignore_null)
self.assert_eq(
pser - pser.astype(bool),
psser - psser.astype(bool),
check_exact=False,
ignore_null=ignore_null,
)
self.assert_eq(pser - True, psser - True, check_exact=False, ignore_null=ignore_null)
self.assert_eq(pser - False, psser - False, check_exact=False, ignore_null=ignore_null)

for n_col in self.non_numeric_df_cols:
if n_col == "bool":
self.assert_eq(pser - pdf[n_col], psser - psdf[n_col], check_exact=False)
self.assert_eq(
pser - pdf[n_col],
psser - psdf[n_col],
check_exact=False,
ignore_null=ignore_null,
)
else:
self.assertRaises(TypeError, lambda: psser - psdf[n_col])

15 changes: 11 additions & 4 deletions python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
@@ -37,10 +37,17 @@ def test_mod(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser % pser, psser % psser, check_exact=False)
self.assert_eq(pser % pser.astype(bool), psser % psser.astype(bool), check_exact=False)
self.assert_eq(pser % True, psser % True, check_exact=False)
self.assert_eq(pser % 1, psser % 1, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(pser % pser, psser % psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(
pser % pser.astype(bool),
psser % psser.astype(bool),
check_exact=False,
ignore_null=ignore_null,
)
self.assert_eq(pser % True, psser % True, check_exact=False, ignore_null=ignore_null)
self.assert_eq(pser % 1, psser % 1, check_exact=False, ignore_null=ignore_null)

if not col.startswith("decimal"):
self.assert_eq(pser % 0, psser % 0, check_exact=False)
if col in ["int", "int32"]:
11 changes: 8 additions & 3 deletions python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -102,7 +102,8 @@ def test_from_to_pandas(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(pser, psser._to_pandas(), check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(pser, psser._to_pandas(), check_exact=False, ignore_null=ignore_null)
self.assert_eq(ps.from_pandas(pser), psser)

def test_isnull(self):
@@ -113,12 +114,16 @@ def test_isnull(self):
def test_neg(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
self.assert_eq(-pdf[col], -psdf[col], check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(-pdf[col], -psdf[col], check_exact=False, ignore_null=ignore_null)

def test_abs(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
self.assert_eq(abs(pdf[col]), abs(psdf[col]), check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(
abs(pdf[col]), abs(psdf[col]), check_exact=False, ignore_null=ignore_null
)

def test_invert(self):
pdf, psdf = self.pdf, self.psdf
28 changes: 16 additions & 12 deletions python/pyspark/pandas/tests/data_type_ops/test_num_reverse.py
@@ -44,35 +44,38 @@ def test_radd(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(1 + pser, 1 + psser, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(1 + pser, 1 + psser, check_exact=False, ignore_null=ignore_null)
# self.assert_eq(0.1 + pser, 0.1 + psser)
self.assertRaises(TypeError, lambda: "x" + psser)
self.assert_eq(True + pser, True + psser, check_exact=False)
self.assert_eq(False + pser, False + psser, check_exact=False)
self.assert_eq(True + pser, True + psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(False + pser, False + psser, check_exact=False, ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) + psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) + psser)

def test_rsub(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(1 - pser, 1 - psser, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(1 - pser, 1 - psser, check_exact=False, ignore_null=ignore_null)
# self.assert_eq(0.1 - pser, 0.1 - psser)
self.assertRaises(TypeError, lambda: "x" - psser)
self.assert_eq(True - pser, True - psser, check_exact=False)
self.assert_eq(False - pser, False - psser, check_exact=False)
self.assert_eq(True - pser, True - psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(False - pser, False - psser, check_exact=False, ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) - psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) - psser)

def test_rmul(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(1 * pser, 1 * psser, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(1 * pser, 1 * psser, check_exact=False, ignore_null=ignore_null)
# self.assert_eq(0.1 * pser, 0.1 * psser)
self.assertRaises(TypeError, lambda: "x" * psser)
self.assert_eq(True * pser, True * psser, check_exact=False)
self.assert_eq(False * pser, False * psser, check_exact=False)
self.assert_eq(True * pser, True * psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(False * pser, False * psser, check_exact=False, ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) * psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) * psser)

@@ -116,10 +119,11 @@ def test_rmod(self):
pdf, psdf = self.pdf, self.psdf
for col in self.numeric_df_cols:
pser, psser = pdf[col], psdf[col]
self.assert_eq(1 % pser, 1 % psser, check_exact=False)
ignore_null = self.ignore_null(col)
self.assert_eq(1 % pser, 1 % psser, check_exact=False, ignore_null=ignore_null)
# self.assert_eq(0.1 % pser, 0.1 % psser)
self.assert_eq(True % pser, True % psser, check_exact=False)
self.assert_eq(False % pser, False % psser, check_exact=False)
self.assert_eq(True % pser, True % psser, check_exact=False, ignore_null=ignore_null)
self.assert_eq(False % pser, False % psser, check_exact=False, ignore_null=ignore_null)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) % psser)

4 changes: 4 additions & 0 deletions python/pyspark/pandas/tests/data_type_ops/testing_utils.py
@@ -22,6 +22,7 @@
import pandas as pd

import pyspark.pandas as ps
from pyspark.loose_version import LooseVersion
from pyspark.pandas.typedef.typehints import (
extension_dtypes_available,
extension_float_dtypes_available,
@@ -219,3 +220,6 @@ def check_extension(self, left, right):
pandas versions. Please refer to https://github.com/pandas-dev/pandas/issues/39410.
"""
self.assert_eq(left, right)

def ignore_null(self, col):
return LooseVersion(pd.__version__) >= LooseVersion("3.0") and col == "decimal_nan"
Contributor:

Is decimal_nan the only case where this happens? I think this can matter any time we do a calculation that results in a null-like value.
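For instance (a minimal illustration, not from this PR), even a float column with no nulls can produce null-like results:

>>> import pandas as pd
>>> s = pd.Series([1.0, 2.0])
>>> s % 0  # float modulo by zero yields NaN
0   NaN
1   NaN
dtype: float64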

Member Author:

Yes, as far as I've observed, it only happens with decimal_nan.

Member Author:

We can use subTest to check the parameters in the test loops. WDYT?
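A minimal sketch of that idea, reusing the loop shape from the test files above (illustrative only, not the PR's code; pdf, psdf, and the assertion helpers are assumed to exist on the test class):

for col in self.numeric_df_cols:
    # subTest makes a failure report which column was being checked
    with self.subTest(col=col):
        pser, psser = pdf[col], psdf[col]
        self.assert_eq(pser + 1, psser + 1, check_exact=False)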

Contributor:

I think most of the issues happen because decimal_nan actually contains null-like values; some of the other columns don't. We should probably not rely on the column name as a fact. A better way is probably to check whether the column contains any null-like value, e.g. with col.isna().to_numpy().any()?
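A hedged sketch of that alternative, deriving the flag from the data rather than the column name (the pser parameter name is illustrative):

def ignore_null(self, pser):
    # Hypothetical data-driven variant of ignore_null: flag any column
    # that actually contains a null-like value, regardless of its name.
    return bool(pser.isna().to_numpy().any())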

Member Author:

The actual issue is how decimal.Decimal(np.nan) is handled?

For the other numeric types, None becomes NaN when converting to pandas, which is well handled.
For the other types, None stays None anyway.

But decimal.Decimal(np.nan) is a special kind of value that Spark can't handle well anyway?

It becomes None in the pandas API on Spark, since Spark doesn't have a concept of NaN for the decimal type.

>>> pdf = pd.DataFrame([decimal.Decimal(np.nan), None])
>>> pdf
      0
0   NaN
1  None
>>>
>>> psdf = ps.from_pandas(pdf)
>>> psdf
      0
0  None
1  None

Contributor:

Okay, so for these tests we are doing operations on a "decimal column" - which is really just object in pandas, because pandas does not have a decimal dtype. The psdf output, unfortunately, also has dtype object, because object + anything is object. So there is no way for us to know that we should convert this None to np.nan.

Then I guess this change is fine - we should ignore the null differences from operations on "decimal" data - which is just object.
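A small demonstration of that point with stock pandas (not part of the PR):

>>> import decimal
>>> import pandas as pd
>>> s = pd.Series([decimal.Decimal("1.5"), decimal.Decimal("2.5")])
>>> s.dtype  # pandas has no decimal dtype, so the column is object
dtype('O')
>>> (s + 1).dtype  # arithmetic on object stays object
dtype('O')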