Skip to content

Commit 4d8953d

Browse files
committed
table_from_frame: replace nan with StringVariable.Unknown for string variables
1 parent c860359 commit 4d8953d

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

Orange/data/pandas_compat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def vars_from_df(df, role=None, force_nominal=False):
280280
raise ValueError("String variable must be in metas.")
281281
_role = Role.Meta
282282
var = StringVariable(str(column))
283-
expr = lambda s, _: np.asarray(s, dtype=object)
283+
expr = lambda s, _: np.asarray(s.fillna(StringVariable.Unknown), dtype=object)
284284

285285
cols[_role].append(column)
286286
exprs[_role].append(expr)

Orange/data/tests/test_pandas.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,9 @@ def test_table_from_frame(self):
3434
[0, pd.Timestamp('1724-12-20').timestamp()],
3535
[0, pd.Timestamp('1724-12-20').timestamp()],
3636
[nan, nan]])
37-
np.testing.assert_equal(table.metas.tolist(), [['a'],
38-
['b'],
39-
['c'],
40-
[nan]])
37+
np.testing.assert_equal(
38+
table.metas.tolist(), [["a"], ["b"], ["c"], [StringVariable.Unknown]]
39+
)
4140
names = [var.name for var in table.domain.attributes]
4241
types = [type(var) for var in table.domain.attributes]
4342
self.assertEqual(names, ['1', '2'])
@@ -383,7 +382,7 @@ def test_table_from_frame_timezones(self):
383382
],
384383
)
385384

386-
def test_table_from_frame_no_datetim(self):
385+
def test_table_from_frame_no_datetime(self):
387386
"""
388387
In case when dtype of column is object and column contains numbers only,
389388
column could be recognized as a TimeVariable since pd.to_datetime can parse
@@ -402,6 +401,25 @@ def test_table_from_frame_no_datetim(self):
402401
# check that the column is recognized as a DiscreteVariable and not as a TimeVariable
403402
self.assertIsInstance(table.domain.attributes[0], DiscreteVariable)
404403

404+
def testa_table_from_frame_string(self):
405+
"""
406+
Test if string-like variables are handled correctly and nans are replaced
407+
with an empty string - the unknown value for a string variable in an Orange table
408+
"""
409+
from Orange.data.pandas_compat import table_from_frame
410+
411+
# s1 contains np.nan and s2 contains pd.NA
412+
df = pd.DataFrame(
413+
[["a", "b"], ["c", "d"], ["e", "f"], [np.nan, np.nan]],
414+
columns=["s1", "s2"],
415+
).astype({"s1": "object", "s2": "string"})
416+
table = table_from_frame(df)
417+
np.testing.assert_array_equal(np.empty((4, 0)), table.X)
418+
np.testing.assert_array_equal(
419+
np.array([["a", "b"], ["c", "d"], ["e", "f"], [StringVariable.Unknown, StringVariable.Unknown]]), table.metas
420+
)
421+
self.assertTrue(all(isinstance(v, StringVariable) for v in table.domain.metas))
422+
405423
def test_time_variable_compatible(self):
406424
from Orange.data.pandas_compat import table_from_frame
407425

0 commit comments

Comments
 (0)