From 32a401c65c8c4d93094e73ee2adff968cd745cdf Mon Sep 17 00:00:00 2001 From: "Maarten A. Breddels" Date: Thu, 16 Sep 2021 13:56:48 +0200 Subject: [PATCH] correct describe_null for arrow and numpy --- packages/vaex-core/vaex/dataframe_protocol.py | 15 +++++++++++++-- tests/dataframe_protocol_test.py | 16 +++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/packages/vaex-core/vaex/dataframe_protocol.py b/packages/vaex-core/vaex/dataframe_protocol.py index 476c5c99f2..d74f6d0f51 100644 --- a/packages/vaex-core/vaex/dataframe_protocol.py +++ b/packages/vaex-core/vaex/dataframe_protocol.py @@ -422,8 +422,19 @@ def describe_null(self) -> Tuple[int, Any]: kind = self.dtype[0] value = None if kind in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): - null = 3 - value = 1 + if self._col.dtype.is_arrow: + # arrow arrays always allow for null values + # where 0 encodes a null/missing value + null = 3 + value = 0 + elif self._col.is_masked: + # masked arrays are always numpy.ma arrays + null = 4 + value = 1 + else: + # otherwise we have a normal numpy array + null = 0 + value = None else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") diff --git a/tests/dataframe_protocol_test.py b/tests/dataframe_protocol_test.py index 00a4a04e45..9f07a35ab1 100644 --- a/tests/dataframe_protocol_test.py +++ b/tests/dataframe_protocol_test.py @@ -57,7 +57,10 @@ def test_mixed_intfloatbool(df_factory): with pytest.raises(TypeError): assert df2.__dataframe__().get_column_by_name("y").describe_categorical - assert df2.__dataframe__().get_column_by_name("y").describe_null == (3, 1) + if df2['y'].dtype.is_arrow: + assert df2.__dataframe__().get_column_by_name("y").describe_null == (3, 0) + else: + assert df2.__dataframe__().get_column_by_name("y").describe_null == (0, None) assert_dataframe_equal(df.__dataframe__(), df) @@ -127,17 +130,17 @@ def test_categorical_ordinal(): col = df.__dataframe__().get_column_by_name("colors") assert
col.dtype[0] == _DtypeKind.CATEGORICAL assert col.describe_categorical == (False, True, {0: "red", 1: "green", 2: "blue"}) - assert col.describe_null == (3, 1) + assert col.describe_null == (0, None) assert col.dtype == (23, 64, "u", "=") col2 = df.__dataframe__().get_column_by_name("year") assert col2.dtype[0] == _DtypeKind.CATEGORICAL assert col2.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019}) - assert col2.describe_null == (3, 1) + assert col2.describe_null == (0, None) assert col2.dtype == (23, 64, "u", "=") col3 = df.__dataframe__().get_column_by_name("weekday") assert col3.dtype[0] == _DtypeKind.CATEGORICAL assert col3.describe_categorical == (False, True, {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}) - assert col3.describe_null == (3, 1) + assert col3.describe_null == (0, None) assert col3.dtype == (23, 64, "u", "=") df2 = _from_dataframe_to_vaex(df.__dataframe__()) @@ -158,7 +161,10 @@ def test_arrow_dictionary(): col = df.__dataframe__().get_column_by_name("x") assert col.dtype[0] == _DtypeKind.CATEGORICAL assert col.describe_categorical == (False, True, {0: "foo", 1: "bar", 2: "baz"}) - assert col.describe_null == (3, 1) + if df['x'].dtype.is_arrow: + assert col.describe_null == (3, 0) + else: + assert col.describe_null == (0, None) assert col.dtype == (23, 64, "u", "=") df2 = _from_dataframe_to_vaex(df.__dataframe__())