Skip to content

Commit 0f5025a

Browse files
authored
Merge pull request #6058 from janezd/table-get-column
[ENH] Table: Add methods get_column and set_column
2 parents 541be8c + 3a549a2 commit 0f5025a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+488
-200
lines changed

Orange/data/domain.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def match(var):
6969
sourceindex = source.index(sourcevar)
7070
if var.is_discrete and var is not sourcevar:
7171
mapping = var.get_mapper_from(sourcevar)
72-
return lambda table: mapping(table.get_column_view(sourceindex)[0])
72+
return lambda table: mapping(table.get_column(sourceindex))
7373
return source.index(var)
7474
return var.compute_value # , which may also be None
7575

Orange/data/table.py

Lines changed: 113 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from itertools import chain
1212
from numbers import Real, Integral
1313
from threading import Lock
14-
from typing import List, TYPE_CHECKING
14+
from typing import List, TYPE_CHECKING, Union
1515

1616
import bottleneck as bn
1717
import numpy as np
@@ -208,8 +208,19 @@ def __init__(self, domain):
208208
setattr(self, v.name.replace(" ", "_"), v)
209209

210210

211-
class _ArrayConversion:
211+
def _compute_column(func, *args, **kwargs):
212+
col = func(*args, **kwargs)
213+
if isinstance(col, np.ndarray) and col.ndim != 1:
214+
err = f"{type(col)} must return a column, not {col.ndim}d array"
215+
if col.ndim == 2:
216+
warnings.warn(err)
217+
col = col.reshape(-1)
218+
else:
219+
raise ValueError(err)
220+
return col
212221

222+
223+
class _ArrayConversion:
213224
def __init__(self, target, src_cols, variables, is_sparse, source_domain):
214225
self.target = target
215226
self.src_cols = src_cols
@@ -300,9 +311,9 @@ def get_columns(self, source, row_indices, n_rows, out=None, target_indices=None
300311
shared = col.compute_shared(sourceri)
301312
_idcache_save(shared_cache, (col.compute_shared, source), shared)
302313
col_array = match_density(
303-
col(sourceri, shared_data=shared))
314+
_compute_column(col, sourceri, shared_data=shared))
304315
else:
305-
col_array = match_density(col(sourceri))
316+
col_array = match_density(_compute_column(col, sourceri))
306317
elif col < 0:
307318
col_array = match_density(
308319
source.metas[row_indices, -1 - col]
@@ -1425,7 +1436,7 @@ def add_column(self, variable, data, to_metas=None):
14251436
domain = Domain(attrs, classes, metavars)
14261437
new_table = self.transform(domain)
14271438
with new_table.unlocked(new_table.metas if to_metas else new_table.X):
1428-
new_table.get_column_view(variable)[0][:] = data
1439+
new_table.set_column(variable, data)
14291440
return new_table
14301441

14311442
def is_sparse(self):
@@ -1554,43 +1565,114 @@ def shuffle(self):
15541565
self.W = self.W[ind]
15551566
self.ids = self.ids[ind]
15561567

1557-
def get_column_view(self, index):
1568+
@deprecated("Table.get_column (or Table.set_column if you must)")
1569+
def get_column_view(self, index: Union[Integral, Variable]) -> np.ndarray:
15581570
"""
1559-
Return a vector - as a view, not a copy - with a column of the table,
1560-
and a bool flag telling whether this column is sparse. Note that
1561-
vertical slicing of sparse matrices is inefficient.
1571+
An obsolete function that was supposed to return a view with a column
1572+
of the table, and a bool flag telling whether this column is sparse.
1573+
1574+
The function *sometimes* returns a copy. This happens if the variable
1575+
is computed or if values of discrete attribute need to be remapped due
1576+
to different encoding.
1577+
1578+
Note that vertical slicing of sparse matrices is inefficient.
15621579
15631580
:param index: the index of the column
15641581
:type index: int, str or Orange.data.Variable
15651582
:return: (one-dimensional numpy array, sparse)
15661583
"""
1567-
1568-
def rx(M):
1569-
if sp.issparse(M):
1570-
return np.asarray(M.todense())[:, 0], True
1571-
else:
1572-
return M, False
1573-
15741584
if isinstance(index, Integral):
15751585
col_index = index
15761586
else:
15771587
col_index = self.domain.index(index)
1578-
if col_index >= 0:
1579-
if col_index < self.X.shape[1]:
1580-
col = rx(self.X[:, col_index])
1581-
elif self._Y.ndim == 1 and col_index == self._X.shape[1]:
1582-
col = rx(self._Y)
1583-
else:
1584-
col = rx(self._Y[:, col_index - self.X.shape[1]])
1585-
else:
1586-
col = rx(self.metas[:, -1 - col_index])
1588+
col = self._get_column_view(col_index)
1589+
1590+
sparse = sp.issparse(col)
1591+
if sparse:
1592+
# `index` below can be integer or a Variable
1593+
warnings.warn("get_column_view is returning a dense copy column "
1594+
f"{index}")
1595+
col = np.asarray(col.todense())[:, 0]
15871596

15881597
if isinstance(index, DiscreteVariable) \
15891598
and index.values != self.domain[col_index].values:
1590-
col = index.get_mapper_from(self.domain[col_index])(col[0]), col[1]
1591-
col[0].flags.writeable = False
1599+
col = index.get_mapper_from(self.domain[col_index])(col)
1600+
col.flags.writeable = False
1601+
warnings.warn("get_column_view is returning a mapped copy of "
1602+
f"column {index.name}")
1603+
return col, sparse
1604+
1605+
def _get_column_view(self, index: Integral) -> np.ndarray:
1606+
if index >= 0:
1607+
if index < self.X.shape[1]:
1608+
return self.X[:, index]
1609+
elif self._Y.ndim == 1 and index == self._X.shape[1]:
1610+
return self._Y
1611+
else:
1612+
return self._Y[:, index - self.X.shape[1]]
1613+
else:
1614+
return self.metas[:, -1 - index]
1615+
def get_column(self, index, copy=False):
    """
    Return a column with values of `index`.

    If `index` is an instance of variable that does not exist in the domain
    but has `compute_value`, `get_column` calls `compute_value` and returns
    its result; `copy` is not consulted in this case. Otherwise, it returns
    the column stored in the table, converted if necessary: sparse columns
    are made dense, object columns of primitive variables are cast to
    float, and values of a discrete variable with different encoding are
    remapped.

    Args:
        index (int or str or Variable): attribute
        copy (bool): if set to True, ensure the result is a copy, not a view

    Returns:
        column (np.array): data column

    Raises:
        ValueError: if `index` is a variable that is not in the domain and
            has no `compute_value`
    """
    if isinstance(index, Variable) and index not in self.domain:
        if index.compute_value is None:
            raise ValueError(f"variable {index.name} is not in domain")
        return _compute_column(index.compute_value, self)

    mapper = None
    if not isinstance(index, Integral):
        if isinstance(index, DiscreteVariable) \
                and index.values != self.domain[index].values:
            mapper = index.get_mapper_from(self.domain[index])
        index = self.domain.index(index)

    stored = self._get_column_view(index)
    col = stored
    if sp.issparse(col):
        col = col.toarray().reshape(-1)
    if col.dtype == object and self.domain[index].is_primitive():
        col = col.astype(np.float64)
    if mapper is not None:
        col = mapper(col)
    # `col.base is None` alone does not prove that `col` is detached from
    # the table: for a 1d class array, `_get_column_view` returns the
    # table's own array, which may own its data. Copy in that case, too.
    if copy and (col is stored or col.base is not None):
        col = col.copy()
    return col
15931653

def set_column(self, index: Union[int, str, Variable], data):
    """
    Set the values in the given column to `data`.

    This function may be useful, but try avoiding it.

    The table (or the corresponding part of it) must be unlocked. If the
    variable is discrete, its encoding must match the variable in the
    domain.

    Args:
        index (int, str, Variable): index of a column
        data (object): a single value or 1d array of length len(self)

    Raises:
        ValueError: if `index` is a discrete variable whose values differ
            from the values of the corresponding variable in the domain
    """
    if isinstance(index, Integral):
        col_index = index
    else:
        if isinstance(index, DiscreteVariable):
            in_domain = self.domain[index]
            if in_domain.values != index.values:
                raise ValueError(
                    f"cannot set data for variable {index.name} "
                    "with different encoding")
        col_index = self.domain.index(index)
    # Assign through a full slice so the values are written into the
    # object returned by `_get_column_view` rather than rebinding a name.
    column = self._get_column_view(col_index)
    column[:] = data
1675+
15941676
def _filter_is_defined(self, columns=None, negate=False):
15951677
# structure of function is obvious; pylint: disable=too-many-branches
15961678
def _sp_anynan(a):
@@ -1620,10 +1702,8 @@ def _sp_anynan(a):
16201702
else:
16211703
remove = np.zeros(len(self), dtype=bool)
16221704
for column in columns:
1623-
col, sparse = self.get_column_view(column)
1624-
if sparse:
1625-
remove += col == 0
1626-
elif self.domain[column].is_primitive():
1705+
col = self.get_column(column)
1706+
if self.domain[column].is_primitive():
16271707
remove += bn.anynan([col.astype(float)], axis=0)
16281708
else:
16291709
remove += col.astype(bool)
@@ -1650,7 +1730,7 @@ def _filter_has_class(self, negate=False):
16501730
def _filter_same_value(self, column, value, negate=False):
16511731
if not isinstance(value, Real):
16521732
value = self.domain[column].to_val(value)
1653-
sel = self.get_column_view(column)[0] == value
1733+
sel = self.get_column(column) == value
16541734
if negate:
16551735
sel = np.logical_not(sel)
16561736
return self.from_table_rows(self, sel)
@@ -1736,7 +1816,7 @@ def get_col_indices():
17361816
raise TypeError("Invalid filter")
17371817

17381818
def col_filter(col_idx):
1739-
col = self.get_column_view(col_idx)[0]
1819+
col = self.get_column(col_idx)
17401820
if isinstance(filter, IsDefined):
17411821
if self.domain[col_idx].is_primitive():
17421822
return ~np.isnan(col.astype(float))

0 commit comments

Comments
 (0)