Skip to content

Commit 8cb7f58

Browse files
authored
Merge branch 'master' into table-get-column
2 parents 4b8aa55 + 4788c6e commit 8cb7f58

File tree

9 files changed

+899
-232
lines changed

9 files changed

+899
-232
lines changed

Orange/widgets/data/owaggregatecolumns.py

Lines changed: 163 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,30 @@
1-
from typing import List
1+
from itertools import chain
2+
from typing import List, NamedTuple, Callable
23

34
import numpy as np
45

5-
from AnyQt.QtWidgets import QSizePolicy
6+
from AnyQt.QtWidgets import QSizePolicy, QStyle, \
7+
QButtonGroup, QRadioButton, QComboBox
68
from AnyQt.QtCore import Qt
9+
710
from Orange.data import Variable, Table, ContinuousVariable, TimeVariable
811
from Orange.data.util import get_unique_names
912
from Orange.widgets import gui, widget
1013
from Orange.widgets.settings import (
1114
ContextSetting, Setting, DomainContextHandler
1215
)
16+
from Orange.widgets.utils.signals import AttributeList
1317
from Orange.widgets.utils.widgetpreview import WidgetPreview
1418
from Orange.widgets.widget import Input, Output
1519
from Orange.widgets.utils.itemmodels import DomainModel
1620

1721

22+
class OpDesc(NamedTuple):
    """Description of one row-wise aggregation offered by the widget.

    Fields:
        name: label shown to the user for this operation.
        func: NumPy-style reducer; it is called as ``func(array, axis=1)``.
        time_preserving: whether the result may stay a ``TimeVariable``
            when every aggregated column is a time variable.
    """

    name: str
    func: Callable[[np.ndarray], np.ndarray]
    time_preserving: bool = False
26+
27+
1828
class OWAggregateColumns(widget.OWWidget):
1929
name = "Aggregate Columns"
2030
description = "Compute a sum, max, min ... of selected columns."
@@ -26,53 +36,84 @@ class OWAggregateColumns(widget.OWWidget):
2636

2737
class Inputs:
2838
data = Input("Data", Table, default=True)
39+
features = Input("Features", AttributeList)
2940

3041
class Outputs:
3142
data = Output("Data", Table)
3243

44+
class Warning(widget.OWWidget.Warning):
    # Raised from set_features: the "Features" input contained attributes
    # that are not continuous; the placeholder receives their names.
    discrete_features = widget.Msg("Some input features are categorical:\n{}")
    # Raised while resolving the variables to aggregate: some of the input
    # features do not exist in the data domain.
    missing_features = widget.Msg("Some input features are missing:\n{}")
47+
3348
want_main_area = False
3449

50+
Operations = {"Sum": OpDesc("Sum", np.nansum),
51+
"Product": OpDesc("Product", np.nanprod),
52+
"Min": OpDesc("Minimal value", np.nanmin, True),
53+
"Max": OpDesc("Maximal value", np.nanmax, True),
54+
"Mean": OpDesc("Mean value", np.nanmean, True),
55+
"Variance": OpDesc("Variance", np.nanvar),
56+
"Median": OpDesc("Median", np.nanmedian, True)}
57+
KeyFromDesc = {op.name: key for key, op in Operations.items()}
58+
59+
SelectAll, SelectAllAndMeta, InputFeatures, SelectManually = range(4)
60+
3561
settingsHandler = DomainContextHandler()
3662
variables: List[Variable] = ContextSetting([])
37-
operation = Setting("Sum")
38-
var_name = Setting("agg")
63+
selection_method: int = Setting(SelectManually, schema_only=True)
64+
operation = ContextSetting("Sum")
65+
var_name = Setting("agg", schema_only=True)
3966
auto_apply = Setting(True)
4067

41-
Operations = {"Sum": np.nansum, "Product": np.nanprod,
42-
"Min": np.nanmin, "Max": np.nanmax,
43-
"Mean": np.nanmean, "Variance": np.nanvar,
44-
"Median": np.nanmedian}
45-
TimePreserving = ("Min", "Max", "Mean", "Median")
46-
4768
def __init__(self):
4869
super().__init__()
4970
self.data = None
71+
self.features = None
5072

51-
box = gui.vBox(self.controlArea, box=True)
73+
self.selection_box = gui.vBox(self.controlArea, "Variable selection")
74+
self.selection_group = QButtonGroup(self.selection_box)
75+
for i, label in enumerate(("All",
76+
"All, including meta attributes",
77+
"Features from separate input signal",
78+
"Selected variables")):
79+
button = QRadioButton(label)
80+
if i == self.selection_method:
81+
button.setChecked(True)
82+
self.selection_group.addButton(button, id=i)
83+
self.selection_box.layout().addWidget(button)
84+
self.selection_group.idClicked.connect(self._on_sel_method_changed)
5285

5386
self.variable_model = DomainModel(
54-
order=DomainModel.MIXED, valid_types=(ContinuousVariable, ))
87+
order=(DomainModel.ATTRIBUTES, DomainModel.METAS),
88+
valid_types=ContinuousVariable)
89+
pixm: QStyle = self.style().pixelMetric
90+
ind_width = pixm(QStyle.PM_ExclusiveIndicatorWidth) + \
91+
pixm(QStyle.PM_RadioButtonLabelSpacing)
5592
var_list = gui.listView(
56-
box, self, "variables", model=self.variable_model,
93+
gui.indentedBox(self.selection_box, ind_width), self, "variables",
94+
model=self.variable_model,
5795
callback=self.commit.deferred
5896
)
5997
var_list.setSelectionMode(var_list.ExtendedSelection)
6098

61-
combo = gui.comboBox(
62-
box, self, "operation",
63-
label="Operator: ", orientation=Qt.Horizontal,
64-
items=list(self.Operations), sendSelectedValue=True,
65-
callback=self.commit.deferred
66-
)
67-
combo.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)
99+
box = gui.vBox(self.controlArea, box="Operation")
100+
combo = self.operation_combo = QComboBox()
101+
combo.addItems([op.name for op in self.Operations.values()])
102+
combo.textActivated[str].connect(self._on_operation_changed)
103+
combo.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Fixed)
104+
combo.setCurrentText(self.Operations[self.operation].name)
105+
box.layout().addWidget(combo)
68106

69107
gui.lineEdit(
70108
box, self, "var_name",
71-
label="Variable name: ", orientation=Qt.Horizontal,
109+
label="Output variable name: ", orientation=Qt.Horizontal,
72110
callback=self.commit.deferred
73111
)
74112

75-
gui.auto_apply(self.controlArea, self)
113+
gui.auto_apply(self.buttonsArea, self)
114+
115+
self._update_selection_buttons()
116+
76117

77118
@Inputs.data
78119
def set_data(self, data: Table = None):
@@ -82,56 +123,138 @@ def set_data(self, data: Table = None):
82123
if self.data:
83124
self.variable_model.set_domain(data.domain)
84125
self.openContext(data)
126+
self.operation_combo.setCurrentText(self.Operations[self.operation].name)
85127
else:
86128
self.variable_model.set_domain(None)
129+
130+
@Inputs.features
def set_features(self, features):
    """Handle the "Features" input signal.

    Only continuous attributes are kept in ``self.features``; the names of
    the dropped ones are reported through the ``discrete_features``
    warning.  ``None`` (signal removed) resets ``self.features`` to ``None``
    and hides the warning.
    """
    numeric = None
    dropped = []
    if features is not None:
        numeric = [attr for attr in features if attr.is_continuous]
        dropped = self._missing(features, numeric)
    self.features = numeric
    # `shown=False` hides the message when nothing was dropped.
    self.Warning.discrete_features(dropped, shown=bool(dropped))
139+
140+
def _update_selection_buttons(self):
    """Synchronise the radio buttons and the variable list with the state.

    With features on the input, only the "input features" button is
    checked and enabled and the manual variable list is disabled.
    Without them, the button matching ``selection_method`` is checked, the
    "input features" button is the only disabled one, and the variable
    list is enabled only for manual selection.
    """
    has_features = self.features is not None
    checked_id = self.InputFeatures if has_features else self.selection_method
    for index, button in enumerate(self.selection_group.buttons()):
        is_input_button = index == self.InputFeatures
        button.setChecked(index == checked_id)
        button.setEnabled(
            is_input_button if has_features else not is_input_button)
    self.controls.variables.setEnabled(
        not has_features and self.selection_method == self.SelectManually)
152+
153+
def handleNewSignals(self):
    """Refresh the selection controls and recompute the output right away.

    NOTE(review): presumably invoked by the widget framework once all
    pending input signals have been delivered -- confirm against the
    OWWidget base class.
    """
    self._update_selection_buttons()
    # `.now()` bypasses the auto-apply deferral used by the UI callbacks.
    self.commit.now()
88156

157+
def _on_sel_method_changed(self, i):
    """Slot for the radio-button group: store the clicked button's id.

    ``i`` is the id passed by the button group's ``idClicked`` signal; it
    becomes the new ``selection_method``.  The controls are then refreshed
    and a (possibly deferred) commit is scheduled.
    """
    self.selection_method = i
    self._update_selection_buttons()
    self.commit.deferred()
161+
162+
def _on_operation_changed(self, oper):
    """Slot for the operation combo box.

    ``oper`` is the display name chosen in the combo; it is translated
    back to the key of ``Operations`` before being stored, after which a
    (possibly deferred) commit is scheduled.
    """
    operation_key = self.KeyFromDesc[oper]
    self.operation = operation_key
    self.commit.deferred()
165+
89166
@gui.deferred
def commit(self):
    """Compute the augmented table and send it to the "Data" output."""
    self.Outputs.data.send(self._compute_data())
93170

94171
def _compute_data(self):
95-
if not self.data or not self.variables:
172+
self.Warning.missing_features.clear()
173+
if not self.data:
174+
return self.data
175+
176+
variables = self._variables()
177+
if not self.data or not variables:
96178
return self.data
97179

98-
new_col = self._compute_column()
99-
new_var = self._new_var()
180+
new_col = self._compute_column(variables)
181+
new_var = self._new_var(variables)
100182
return self.data.add_column(new_var, new_col)
101183

102-
def _compute_column(self):
103-
arr = np.empty((len(self.data), len(self.variables)))
104-
for i, var in enumerate(self.variables):
184+
def _variables(self):
185+
self.Warning.missing_features.clear()
186+
if self.features is not None:
187+
selected = [attr for attr in self.features
188+
if attr in self.data.domain]
189+
missing = self._missing(self.features, selected)
190+
self.Warning.missing_features(missing, shown=bool(missing))
191+
return selected
192+
193+
assert self.data
194+
195+
domain = self.data.domain
196+
if self.selection_method == self.SelectAll:
197+
return [attr for attr in domain.attributes
198+
if attr.is_continuous]
199+
if self.selection_method == self.SelectAllAndMeta:
200+
# skip separators
201+
return [attr for attr in chain(domain.attributes, domain.metas)
202+
if attr.is_continuous]
203+
204+
assert self.selection_method == self.SelectManually
205+
return self.variables
206+
207+
def _compute_column(self, variables):
208+
arr = np.empty((len(self.data), len(variables)))
209+
for i, var in enumerate(variables):
105210
arr[:, i] = self.data.get_column(var)
106-
func = self.Operations[self.operation]
211+
func = self.Operations[self.operation].func
107212
return func(arr, axis=1)
108213

109214
def _new_var_name(self):
    """Return the output variable name made unique within the data domain."""
    domain = self.data.domain
    return get_unique_names(domain, self.var_name)
111216

112-
def _new_var(self):
217+
def _new_var(self, variables):
113218
name = self._new_var_name()
114-
if self.operation in self.TimePreserving \
115-
and all(isinstance(var, TimeVariable) for var in self.variables):
219+
if self.Operations[self.operation].time_preserving \
220+
and all(isinstance(var, TimeVariable) for var in variables):
116221
return TimeVariable(name)
117222
return ContinuousVariable(name)
118223

119224
def send_report(self):
120-
# fp for self.variables, pylint: disable=unsubscriptable-object
121-
if not self.data or not self.variables:
225+
if not self.data:
122226
return
123-
var_list = ", ".join(f"'{var.name}'"
124-
for var in self.variables[:31][:-1])
125-
if len(self.variables) > 30:
126-
var_list += f" and {len(self.variables) - 30} others"
127-
else:
128-
var_list += f" and '{self.variables[-1].name}'"
227+
variables = self._variables()
228+
if not variables:
229+
return
230+
var_list = self._and_others(variables, 30)
129231
self.report_items((
130232
("Output:",
131233
f"'{self._new_var_name()}' as {self.operation.lower()} of {var_list}"
132234
),
133235
))
134236

237+
@staticmethod
238+
def _and_others(variables, limit):
239+
if len(variables) == 1:
240+
return f"'{variables[0].name}'"
241+
var_list = ", ".join(f"'{var.name}'"
242+
for var in variables[:limit + 1][:-1])
243+
if len(variables) > limit:
244+
var_list += f" and {len(variables) - limit} more"
245+
else:
246+
var_list += f" and '{variables[-1].name}'"
247+
return var_list
248+
249+
@classmethod
def _missing(cls, given, used):
    """Describe the items of ``given`` that did not make it into ``used``.

    Returns an empty string when both sequences have the same length;
    otherwise the unused items, in their original order, formatted by
    ``_and_others`` with at most five names spelled out.
    """
    if len(used) != len(given):
        kept = set(used)
        # A comprehension rather than a set difference keeps the order.
        absent = [attr for attr in given if attr not in kept]
        return cls._and_others(absent, 5)
    return ""
257+
135258

136259
if __name__ == "__main__": # pragma: no cover
137260
brown = Table("brown-selected")

Orange/widgets/data/owfeaturestatistics.py

Lines changed: 37 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -290,52 +290,45 @@ def __mode(x, *args, **kwargs):
290290
time_f=lambda x: ut.nanmedian(x, axis=0),
291291
)
292292

293-
def get_statistics_matrix(self, variables=None, return_labels=False):
294-
"""Get the numeric computed statistics in a single matrix. Optionally,
295-
we can specify for which variables we want the stats. Also, we can get
296-
the string column names as labels if desired.
297-
298-
Parameters
299-
----------
300-
variables : Iterable[Union[Variable, int, str]]
301-
Return statistics for only the variables specified. Accepts all
302-
formats supported by `domain.index`
303-
return_labels : bool
304-
In addition to the statistics matrix, also return string labels for
305-
the columns of the matrix e.g. 'Mean' or 'Dispersion', as specified
306-
in `Columns`.
307-
308-
Returns
309-
-------
310-
Union[Tuple[List[str], np.ndarray], np.ndarray]
311-
312-
"""
313-
if self.table is None:
314-
return np.atleast_2d([])
293+
def get_statistics_table(self):
294+
"""Get the numeric computed statistics in a single matrix."""
295+
if self.table is None or not self.rowCount():
296+
return None
315297

316-
# If a list of variables is given, select only corresponding stats
317-
# variables can be a list or array, pylint: disable=len-as-condition
318-
if variables is not None and len(variables) != 0:
319-
indices = [self.domain.index(var) for var in variables]
298+
# don't match TimeVariable, pylint: disable=unidiomatic-typecheck
299+
contivars = [type(var) is ContinuousVariable for var in self.variables]
300+
if any(contivars):
301+
def c(column):
302+
return np.choose(contivars, [np.nan, column])
303+
304+
x = np.vstack((
305+
c(self._center), c(self._median), self._dispersion,
306+
c(self._min), c(self._max), self._missing,
307+
)).T
308+
attrs = [ContinuousVariable(column.name) for column in (
309+
self.Columns.CENTER, self.Columns.MEDIAN,
310+
self.Columns.DISPERSION,
311+
self.Columns.MIN, self.Columns.MAX, self.Columns.MISSING)]
320312
else:
321-
indices = ...
322-
323-
matrix = np.vstack((
324-
self._center[indices], self._median[indices],
325-
self._dispersion[indices],
326-
self._min[indices], self._max[indices], self._missing[indices],
327-
)).T
328-
329-
# Return string labels for the returned matrix columns e.g. 'Mean',
330-
# 'Dispersion' if requested
331-
if return_labels:
332-
labels = [self.Columns.CENTER.name, self.Columns.MEDIAN.name,
333-
self.Columns.DISPERSION.name,
334-
self.Columns.MIN.name, self.Columns.MAX.name,
335-
self.Columns.MISSING.name]
336-
return labels, matrix
313+
x = np.vstack((self._dispersion, self._missing)).T
314+
attrs = [ContinuousVariable(name)
315+
for name in ("Entropy", self.Columns.MISSING.name)]
316+
317+
names = [var.name for var in self.variables]
318+
if any(isinstance(var, DiscreteVariable) for var in self.variables):
319+
majorities = [
320+
var.str_val(val) if isinstance(var, DiscreteVariable) else ""
321+
for var, val in zip(self.variables, self._median)]
322+
metas = np.vstack((names, majorities)).T
323+
meta_attrs = [StringVariable('Feature'), StringVariable('Mode')]
324+
else:
325+
metas = np.atleast_2d(names).T
326+
meta_attrs = [StringVariable('Feature')]
337327

338-
return matrix
328+
domain = Domain(attributes=attrs, metas=meta_attrs)
329+
statistics = Table.from_numpy(domain, x, metas=metas)
330+
statistics.name = f'{self.table.name} (Feature Statistics)'
331+
return statistics
339332

340333
def __compute_stat(self, matrices, discrete_f=None, continuous_f=None,
341334
time_f=None, string_f=None, default_val=np.nan):
@@ -866,14 +859,7 @@ def commit(self):
866859
return
867860

868861
# Send the statistics of the selected variables to output
869-
labels, data = self.model.get_statistics_matrix(return_labels=True)
870-
var_names = np.atleast_2d([var.name for var in self.model.variables]).T
871-
domain = Domain(
872-
attributes=[ContinuousVariable(name) for name in labels],
873-
metas=[StringVariable('Feature')]
874-
)
875-
statistics = Table(domain, data, metas=var_names)
876-
statistics.name = '{self.data.name} (Feature Statistics)'
862+
statistics = self.model.get_statistics_table()
877863
self.Outputs.statistics.send(statistics)
878864

879865
def send_report(self):

0 commit comments

Comments
 (0)