Skip to content

Commit 34a2777

Browse files
committed
table_from_frame: replace nan with String.Unknown for string variable
1 parent c860359 commit 34a2777

File tree

2 files changed

+40
-10
lines changed

2 files changed

+40
-10
lines changed

Orange/data/pandas_compat.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,11 @@ def vars_from_df(df, role=None, force_nominal=False):
280280
raise ValueError("String variable must be in metas.")
281281
_role = Role.Meta
282282
var = StringVariable(str(column))
283-
expr = lambda s, _: np.asarray(s, dtype=object)
283+
expr = lambda s, _: np.asarray(
284+
# replace nan with the object that Orange uses for unknown values and
285+
# ensure that all values are strings
286+
s.fillna(StringVariable.Unknown).astype(str), dtype=object
287+
)
284288

285289
cols[_role].append(column)
286290
exprs[_role].append(expr)

Orange/data/tests/test_pandas.py

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,9 @@ def test_table_from_frame(self):
3434
[0, pd.Timestamp('1724-12-20').timestamp()],
3535
[0, pd.Timestamp('1724-12-20').timestamp()],
3636
[nan, nan]])
37-
np.testing.assert_equal(table.metas.tolist(), [['a'],
38-
['b'],
39-
['c'],
40-
[nan]])
37+
np.testing.assert_equal(
38+
table.metas.tolist(), [["a"], ["b"], ["c"], [StringVariable.Unknown]]
39+
)
4140
names = [var.name for var in table.domain.attributes]
4241
types = [type(var) for var in table.domain.attributes]
4342
self.assertEqual(names, ['1', '2'])
@@ -63,10 +62,9 @@ def test_table_from_frame(self):
6362
[1, 0, pd.Timestamp('1724-12-20').timestamp()],
6463
[0, 0, pd.Timestamp('1724-12-20').timestamp()],
6564
[0, nan, nan]])
66-
np.testing.assert_equal(table.metas.tolist(), [['a'],
67-
['b'],
68-
['c'],
69-
[nan]])
65+
np.testing.assert_equal(
66+
table.metas.tolist(), [["a"], ["b"], ["c"], [StringVariable.Unknown]]
67+
)
7068
names = [var.name for var in table.domain.attributes]
7169
types = [type(var) for var in table.domain.attributes]
7270
self.assertEqual(names, ['index', '1', '2'])
@@ -383,7 +381,7 @@ def test_table_from_frame_timezones(self):
383381
],
384382
)
385383

386-
def test_table_from_frame_no_datetim(self):
384+
def test_table_from_frame_no_datetime(self):
387385
"""
388386
In case when dtype of column is object and column contains numbers only,
389387
column could be recognized as a TimeVariable since pd.to_datetime can parse
@@ -402,6 +400,34 @@ def test_table_from_frame_no_datetim(self):
402400
# check that the variable is exactly a DiscreteVariable and was not parsed as a TimeVariable
403401
self.assertIsInstance(table.domain.attributes[0], DiscreteVariable)
404402

403+
def testa_table_from_frame_string(self):
404+
"""
405+
Test if string-like variables are handled correctly and nans are replaced
406+
with StringVariable.Unknown
407+
"""
408+
from Orange.data.pandas_compat import table_from_frame
409+
410+
# s1 contains np.nan and s2 contains pd.NA
411+
df = pd.DataFrame(
412+
[["a", "b"], ["c", "d"], ["e", "f"], [5, "c"], [np.nan, np.nan]],
413+
columns=["s1", "s2"],
414+
).astype({"s1": "object", "s2": "string"})
415+
table = table_from_frame(df)
416+
np.testing.assert_array_equal(np.empty((5, 0)), table.X)
417+
np.testing.assert_array_equal(
418+
np.array(
419+
[
420+
["a", "b"],
421+
["c", "d"],
422+
["e", "f"],
423+
["5", "c"],
424+
[StringVariable.Unknown, StringVariable.Unknown],
425+
]
426+
),
427+
table.metas,
428+
)
429+
self.assertTrue(all(isinstance(v, StringVariable) for v in table.domain.metas))
430+
405431
def test_time_variable_compatible(self):
406432
from Orange.data.pandas_compat import table_from_frame
407433

0 commit comments

Comments
 (0)