diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py index 94207b2ec62..08829eab0a0 100644 --- a/Orange/statistics/util.py +++ b/Orange/statistics/util.py @@ -5,6 +5,7 @@ It also patches bottleneck to contain these functions. """ import warnings +import math import numpy as np import bottleneck as bn @@ -604,3 +605,61 @@ def var(x, axis=None, ddof=0): def std(x, axis=None, ddof=0): """ Equivalent of np.std that supports sparse and dense matrices. """ return np.sqrt(var(x, axis=axis, ddof=ddof)) + + +# To speed up FDR, precompute sum([1/i for i in range(1, m+1)]), +# for m in [1, 99999]. +# For higher values of m use an approximation, with error less than or equal to +# 4.99999157277e-006. (sum([1/i for i in range(1, m+1)]) ~ log(m) + 0.5772..., +# where 0.5772... is the Euler-Mascheroni constant) +c = [1.0] +for m in range(2, 100000): + c.append(c[-1] + 1.0/m) + + +def FDR(p_values, dependent=False, m=None, ordered=False): + """ `False Discovery Rate <http://en.wikipedia.org/wiki/False_discovery_rate>`_ + correction on a list of p-values. + + :param p_values: a list of p-values. + :param dependent: use correction for dependent hypotheses (default False). + :param m: number of hypotheses tested (default ``len(p_values)``). + :param ordered: prevent sorting of p-values if they are already sorted + (default False). 
+ """ + def is_sorted(l): + return all(l[i] <= l[i + 1] for i in range(len(l) - 1)) + + if not ordered: + ordered = is_sorted(p_values) + + if not ordered: + joined = [(v, i) for i, v in enumerate(p_values)] + joined.sort() + p_values = [p[0] for p in joined] + indices = [p[1] for p in joined] + + if not m: + m = len(p_values) + if m <= 0 or not p_values: + return [] + + if dependent: # correct q for dependent tests + k = c[m-1] if m <= len(c) else math.log(m) + 0.57721566490153286060651209008240243104215933593992 + m = m * k + + tmp_fdrs = [p*m/(i+1.0) for (i, p) in enumerate(p_values)] + fdrs = [] + cmin = tmp_fdrs[-1] + for f in reversed(tmp_fdrs): + cmin = min(f, cmin) + fdrs.append( cmin) + fdrs.reverse() + + if not ordered: + new = [None] * len(fdrs) + for v, i in zip(fdrs, indices): + new[i] = v + fdrs = new + + return fdrs diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py index 722025774d3..391c73273a5 100644 --- a/Orange/tests/test_statistics.py +++ b/Orange/tests/test_statistics.py @@ -9,7 +9,7 @@ from Orange.statistics.util import bincount, countnans, contingency, digitize, \ mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \ - unique, var, nanstd, nanvar, nanmode, array_equal + unique, var, nanstd, nanvar, nanmode, array_equal, FDR from sklearn.utils import check_random_state @@ -263,6 +263,12 @@ def test_nanstd_with_ddof(self): nanstd(csr_matrix(x), axis=axis, ddof=10), ) + def test_FDR(self): + p_values = [0.00001, 0.0001, 0.0002, 0.0003, 0.0004] + np.testing.assert_almost_equal( + np.array([0.00005, 0.00025, 0.00033, 0.00038, 0.0004]), + FDR(p_values), decimal=5) + class TestNanmean(unittest.TestCase): def setUp(self): diff --git a/Orange/widgets/data/owcorrelations.py b/Orange/widgets/data/owcorrelations.py index d9f2525a07b..3b00615f845 100644 --- a/Orange/widgets/data/owcorrelations.py +++ b/Orange/widgets/data/owcorrelations.py @@ -9,14 +9,19 @@ from scipy.stats import spearmanr, pearsonr from 
sklearn.cluster import KMeans -from AnyQt.QtCore import Qt, QItemSelectionModel, QItemSelection, QSize -from AnyQt.QtGui import QStandardItem, QColor +from AnyQt.QtCore import Qt, QItemSelectionModel, QItemSelection, \ + QSize, pyqtSignal as Signal +from AnyQt.QtGui import QStandardItem +from AnyQt.QtWidgets import QHeaderView from Orange.data import Table, Domain, ContinuousVariable, StringVariable from Orange.preprocess import SklImpute, Normalize +from Orange.statistics.util import FDR from Orange.widgets import gui from Orange.widgets.settings import Setting, ContextSetting, \ DomainContextHandler +from Orange.widgets.utils import vartype +from Orange.widgets.utils.itemmodels import DomainModel from Orange.widgets.utils.signals import Input, Output from Orange.widgets.utils.widgetpreview import WidgetPreview from Orange.widgets.visualize.utils import VizRankDialogAttrPair @@ -85,28 +90,33 @@ class CorrelationRank(VizRankDialogAttrPair): """ Correlations rank widget. """ - NEGATIVE_COLOR = QColor(70, 190, 250) - POSITIVE_COLOR = QColor(170, 242, 43) + threadStopped = Signal() + PValRole = next(gui.OrangeUserRole) def __init__(self, *args): super().__init__(*args) self.heuristic = None self.use_heuristic = False + self.sel_feature_index = None def initialize(self): super().initialize() data = self.master.cont_data self.attrs = data and data.domain.attributes self.model_proxy.setFilterKeyColumn(-1) - self.rank_table.horizontalHeader().setStretchLastSection(False) self.heuristic = None self.use_heuristic = False + if self.master.feature is not None: + self.sel_feature_index = data.domain.index(self.master.feature) + else: + self.sel_feature_index = None if data: # use heuristic if data is too big n_attrs = len(self.attrs) use_heuristic = n_attrs > KMeansCorrelationHeuristic.n_clusters self.use_heuristic = use_heuristic and \ - len(data) * n_attrs ** 2 > SIZE_LIMIT + len(data) * n_attrs ** 2 > SIZE_LIMIT and \ + self.sel_feature_index is None if self.use_heuristic: 
self.heuristic = KMeansCorrelationHeuristic(data) @@ -114,33 +124,46 @@ def compute_score(self, state): (attr1, attr2), corr_type = state, self.master.correlation_type data = self.master.cont_data.X corr = pearsonr if corr_type == CorrelationType.PEARSON else spearmanr - result = corr(data[:, attr1], data[:, attr2])[0] - return -abs(result) if not np.isnan(result) else NAN, result + r, p_value = corr(data[:, attr1], data[:, attr2]) + return -abs(r) if not np.isnan(r) else NAN, r, p_value def row_for_state(self, score, state): attrs = sorted((self.attrs[x] for x in state), key=attrgetter("name")) - attrs_item = QStandardItem( - "{}, {}".format(attrs[0].name, attrs[1].name)) - attrs_item.setData(attrs, self._AttrRole) - attrs_item.setData(Qt.AlignLeft + Qt.AlignTop, Qt.TextAlignmentRole) + attr_items = [] + for attr in attrs: + item = QStandardItem(attr.name) + item.setData(attrs, self._AttrRole) + item.setData(Qt.AlignLeft + Qt.AlignTop, Qt.TextAlignmentRole) + item.setToolTip(attr.name) + attr_items.append(item) correlation_item = QStandardItem("{:+.3f}".format(score[1])) + correlation_item.setData(score[2], self.PValRole) correlation_item.setData(attrs, self._AttrRole) correlation_item.setData( self.NEGATIVE_COLOR if score[1] < 0 else self.POSITIVE_COLOR, gui.TableBarItem.BarColorRole) - return [correlation_item, attrs_item] + return [correlation_item] + attr_items def check_preconditions(self): return self.master.cont_data is not None def iterate_states(self, initial_state): - if self.use_heuristic: + if self.sel_feature_index is not None: + return self.iterate_states_by_feature() + elif self.use_heuristic: return self.heuristic.get_states(initial_state) else: return super().iterate_states(initial_state) + def iterate_states_by_feature(self): + for j in range(len(self.attrs)): + if j != self.sel_feature_index: + yield self.sel_feature_index, j + def state_count(self): - if self.use_heuristic: + if self.sel_feature_index is not None: + return len(self.attrs) - 1 + 
elif self.use_heuristic: n_clusters = KMeansCorrelationHeuristic.n_clusters n_avg_attrs = len(self.attrs) / n_clusters return n_clusters * n_avg_attrs * (n_avg_attrs - 1) / 2 @@ -152,6 +175,11 @@ def state_count(self): def bar_length(score): return abs(score[1]) + def stopped(self): + self.threadStopped.emit() + header = self.rank_table.horizontalHeader() + header.setSectionResizeMode(1, QHeaderView.Stretch) + class OWCorrelations(OWWidget): name = "Correlations" @@ -169,8 +197,10 @@ class Outputs: want_control_area = False + settings_version = 2 settingsHandler = DomainContextHandler() selection = ContextSetting(()) + feature = ContextSetting(None) correlation_type = Setting(0) class Information(OWWidget.Information): @@ -186,12 +216,23 @@ def __init__(self): box = gui.vBox(self.mainArea) self.correlation_combo = gui.comboBox( box, self, "correlation_type", items=CorrelationType.items(), - orientation=Qt.Horizontal, callback=self._correlation_combo_changed) + orientation=Qt.Horizontal, callback=self._correlation_combo_changed + ) + + self.feature_model = DomainModel( + separators=False, placeholder="(All combinations)", + valid_types=ContinuousVariable, + ) + gui.comboBox( + box, self, "feature", callback=self._feature_combo_changed, + model=self.feature_model + ) self.vizrank, _ = CorrelationRank.add_vizrank( None, self, None, self._vizrank_selection_changed) self.vizrank.progressBar = self.progressBar self.vizrank.button.setEnabled(False) + self.vizrank.threadStopped.connect(self._vizrank_stopped) gui.separator(box) box.layout().addWidget(self.vizrank.filter) @@ -206,22 +247,41 @@ def sizeHint(self): def _correlation_combo_changed(self): self.apply() + def _feature_combo_changed(self): + self.apply() + def _vizrank_selection_changed(self, *args): - self.selection = args + self.selection = [(var.name, vartype(var)) for var in args] self.commit() + def _vizrank_stopped(self): + self._vizrank_select() + def _vizrank_select(self): model = 
self.vizrank.rank_table.model() + if not model.rowCount(): + return selection = QItemSelection() - names = sorted(x.name for x in self.selection) - for i in range(model.rowCount()): - # pylint: disable=protected-access - if sorted(x.name for x in model.data( - model.index(i, 0), CorrelationRank._AttrRole)) == names: - selection.select(model.index(i, 0), model.index(i, 1)) - self.vizrank.rank_table.selectionModel().select( - selection, QItemSelectionModel.ClearAndSelect) - break + + # This flag is needed because the rows in the model may be + # filtered by a feature, so the saved selection might not be found + selection_in_model = False + if self.selection: + sel_names = sorted(name for name, _ in self.selection) + for i in range(model.rowCount()): + # pylint: disable=protected-access + names = sorted(x.name for x in model.data( + model.index(i, 0), CorrelationRank._AttrRole)) + if names == sel_names: + selection.select(model.index(i, 0), + model.index(i, model.columnCount() - 1)) + selection_in_model = True + break + if not selection_in_model: + selection.select(model.index(0, 0), + model.index(0, model.columnCount() - 1)) + self.vizrank.rank_table.selectionModel().select( + selection, QItemSelectionModel.ClearAndSelect) @Inputs.data def set_data(self, data): @@ -240,18 +300,20 @@ def set_data(self, data): domain = data.domain cont_dom = Domain(cont_attrs, domain.class_vars, domain.metas) self.cont_data = SklImpute()(Table.from_table(cont_dom, data)) + self.set_feature_model() + self.openContext(self.cont_data) self.apply() - self.openContext(self.data) - self._vizrank_select() - self.vizrank.button.setEnabled(self.data is not None) + self.vizrank.button.setEnabled(self.cont_data is not None) + + def set_feature_model(self): + self.feature_model.set_domain(self.cont_data and self.cont_data.domain) + self.feature = None def apply(self): self.vizrank.initialize() if self.cont_data is not None: # this triggers self.commit() by changing vizrank selection 
self.vizrank.toggle() - header = self.vizrank.rank_table.horizontalHeader() - header.setStretchLastSection(True) else: self.commit() @@ -262,11 +324,14 @@ def commit(self): self.Outputs.correlations.send(None) return + attrs = [ContinuousVariable("Correlation"), ContinuousVariable("FDR")] metas = [StringVariable("Feature 1"), StringVariable("Feature 2")] - domain = Domain([ContinuousVariable("Correlation")], metas=metas) + domain = Domain(attrs, metas=metas) model = self.vizrank.rank_model - x = np.array([[float(model.data(model.index(row, 0)))] for row - in range(model.rowCount())]) + x = np.array([[float(model.data(model.index(row, 0), role)) + for role in (Qt.DisplayRole, CorrelationRank.PValRole)] + for row in range(model.rowCount())]) + x[:, 1] = FDR(list(x[:, 1])) # pylint: disable=protected-access m = np.array([[a.name for a in model.data(model.index(row, 0), CorrelationRank._AttrRole)] @@ -276,14 +341,21 @@ def commit(self): self.Outputs.data.send(self.data) # data has been imputed; send original attributes - self.Outputs.features.send(AttributeList([attr.compute_value.variable - for attr in self.selection])) + self.Outputs.features.send(AttributeList( + [self.data.domain[name] for name, _ in self.selection])) self.Outputs.correlations.send(corr_table) def send_report(self): self.report_table(CorrelationType.items()[self.correlation_type], self.vizrank.rank_table) + @classmethod + def migrate_context(cls, context, version): + if version < 2: + sel = context.values["selection"] + context.values["selection"] = ([(var.name, vartype(var)) + for var in sel[0]], sel[1]) + if __name__ == "__main__": # pragma: no cover WidgetPreview(OWCorrelations).run(Table("iris")) diff --git a/Orange/widgets/data/tests/test_owcorrelations.py b/Orange/widgets/data/tests/test_owcorrelations.py index dab9cca7147..45d3dd2b8aa 100644 --- a/Orange/widgets/data/tests/test_owcorrelations.py +++ b/Orange/widgets/data/tests/test_owcorrelations.py @@ -1,11 +1,17 @@ # Test methods with long 
descriptive names can omit docstrings # pylint: disable=missing-docstring, protected-access import time -from unittest.mock import patch +from unittest.mock import patch, Mock + +import numpy as np +import numpy.testing as npt + +from AnyQt.QtCore import Qt from Orange.data import Table from Orange.widgets.data.owcorrelations import ( - OWCorrelations, KMeansCorrelationHeuristic + OWCorrelations, KMeansCorrelationHeuristic, CorrelationRank, + CorrelationType ) from Orange.widgets.tests.base import WidgetTest from Orange.widgets.tests.utils import simulate @@ -30,7 +36,7 @@ def test_input_data_cont(self): time.sleep(0.1) n_attrs = len(self.data_cont.domain.attributes) self.process_events() - self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 2) + self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 3) self.assertEqual(self.widget.vizrank.rank_model.rowCount(), n_attrs * (n_attrs - 1) / 2) self.send_signal(self.widget.Inputs.data, None) @@ -52,7 +58,7 @@ def test_input_data_mixed(self): n_attrs = len([a for a in domain.attributes if a.is_continuous]) time.sleep(0.1) self.process_events() - self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 2) + self.assertEqual(self.widget.vizrank.rank_model.columnCount(), 3) self.assertEqual(self.widget.vizrank.rank_model.rowCount(), n_attrs * (n_attrs - 1) / 2) @@ -77,7 +83,6 @@ def test_output_data(self): self.send_signal(self.widget.Inputs.data, self.data_cont) time.sleep(0.1) self.process_events() - self.widget.commit() output = self.get_output(self.widget.Outputs.data) self.assertEqual(self.data_cont, output) @@ -86,8 +91,6 @@ def test_output_features(self): self.send_signal(self.widget.Inputs.data, self.data_cont) time.sleep(0.1) self.process_events() - attrs = self.widget.cont_data.domain.attributes - self.widget._vizrank_selection_changed(attrs[0], attrs[1]) features = self.get_output(self.widget.Outputs.features) self.assertIsInstance(features, AttributeList) self.assertEqual(len(features), 
2) @@ -97,12 +100,45 @@ def test_output_correlations(self): self.send_signal(self.widget.Inputs.data, self.data_cont) time.sleep(0.1) self.process_events() - self.widget.commit() correlations = self.get_output(self.widget.Outputs.correlations) self.assertIsInstance(correlations, Table) self.assertEqual(len(correlations), 6) - self.assertEqual(len(correlations.domain.attributes), 1) self.assertEqual(len(correlations.domain.metas), 2) + self.assertListEqual(["Correlation", "FDR"], + [m.name for m in correlations.domain.attributes]) + array = np.array([[0.963, 0], [0.872, 0], [0.818, 0], [-0.421, 0], + [-0.357, 0.000009], [-0.109, 0.1827652]]) + npt.assert_almost_equal(correlations.X, array) + + def test_input_changed(self): + """Check whether changing input emits commit""" + self.widget.commit = Mock() + self.send_signal(self.widget.Inputs.data, self.data_cont) + time.sleep(0.1) + self.process_events() + self.widget.commit.assert_called_once() + + self.widget.commit.reset_mock() + self.send_signal(self.widget.Inputs.data, self.data_mixed) + time.sleep(0.1) + self.process_events() + self.widget.commit.assert_called_once() + + def test_saved_selection(self): + """Select row from settings""" + self.send_signal(self.widget.Inputs.data, self.data_cont) + time.sleep(0.1) + self.process_events() + attrs = self.widget.cont_data.domain.attributes + self.widget._vizrank_selection_changed(attrs[3], attrs[1]) + settings = self.widget.settingsHandler.pack_data(self.widget) + + w = self.create_widget(OWCorrelations, stored_settings=settings) + self.send_signal(self.widget.Inputs.data, self.data_cont, widget=w) + time.sleep(0.1) + self.process_events() + sel_row = w.vizrank.rank_table.selectionModel().selectedRows()[0].row() + self.assertEqual(sel_row, 4) def test_scatterplot_input_features(self): """Check if attributes have been set after sent to scatterplot""" @@ -137,16 +173,49 @@ def test_correlation_type(self): self.send_signal(self.widget.Inputs.data, self.data_cont) 
time.sleep(0.1) self.process_events() - self.widget.commit() pearson_corr = self.get_output(self.widget.Outputs.correlations) simulate.combobox_activate_item(c_type, "Spearman correlation") time.sleep(0.1) self.process_events() - self.widget.commit() sperman_corr = self.get_output(self.widget.Outputs.correlations) self.assertFalse((pearson_corr.X == sperman_corr.X).all()) + def test_feature_combo(self): + """Check content of feature selection combobox""" + feature_combo = self.widget.controls.feature + self.send_signal(self.widget.Inputs.data, self.data_mixed) + cont_attributes = [attr for attr in self.data_mixed.domain.attributes + if attr.is_continuous] + self.assertEqual(len(feature_combo.model()), len(cont_attributes) + 1) + + def test_select_feature(self): + """Test feature selection""" + feature_combo = self.widget.controls.feature + self.send_signal(self.widget.Inputs.data, self.data_cont) + time.sleep(0.1) + self.process_events() + self.assertEqual(self.widget.vizrank.rank_model.rowCount(), 6) + self.assertListEqual(["petal length", "petal width"], + [a.name for a in self.get_output( + self.widget.Outputs.features)]) + + simulate.combobox_activate_index(feature_combo, 1) + time.sleep(0.1) + self.process_events() + self.assertEqual(self.widget.vizrank.rank_model.rowCount(), 3) + self.assertListEqual(["petal length", "sepal length"], + [a.name for a in self.get_output( + self.widget.Outputs.features)]) + + simulate.combobox_activate_index(feature_combo, 0) + time.sleep(0.1) + self.process_events() + self.assertEqual(self.widget.vizrank.rank_model.rowCount(), 6) + self.assertListEqual(["petal length", "sepal length"], + [a.name for a in self.get_output( + self.widget.Outputs.features)]) + @patch("Orange.widgets.data.owcorrelations.SIZE_LIMIT", 2000) @patch("Orange.widgets.data.owcorrelations." 
"KMeansCorrelationHeuristic.n_clusters", 2) @@ -154,7 +223,20 @@ def test_vizrank_use_heuristic(self): self.send_signal(self.widget.Inputs.data, self.data_cont) time.sleep(0.1) self.process_events() - self.widget.commit() + self.assertEqual(self.widget.vizrank.rank_model.rowCount(), + len(self.widget.cont_data.domain.attributes) - 1) + + @patch("Orange.widgets.data.owcorrelations.SIZE_LIMIT", 2000) + @patch("Orange.widgets.data.owcorrelations." + "KMeansCorrelationHeuristic.n_clusters", 1) + def test_select_feature_against_heuristic(self): + """Never use heuristic if feature is selected""" + feature_combo = self.widget.controls.feature + self.send_signal(self.widget.Inputs.data, self.data_cont) + simulate.combobox_activate_index(feature_combo, 2) + time.sleep(0.1) + self.process_events() + self.assertEqual(self.widget.vizrank.rank_model.rowCount(), 3) def test_send_report(self): """Test report """ @@ -162,3 +244,34 @@ def test_send_report(self): self.widget.report_button.click() self.send_signal(self.widget.Inputs.data, None) self.widget.report_button.click() + + +class TestCorrelationRank(WidgetTest): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.iris = Table("iris") + cls.attrs = cls.iris.domain.attributes + + def setUp(self): + self.vizrank = CorrelationRank(None) + self.vizrank.attrs = self.attrs + + def test_compute_score(self): + self.vizrank.master = Mock() + self.vizrank.master.cont_data = self.iris + self.vizrank.master.correlation_type = CorrelationType.PEARSON + npt.assert_almost_equal(self.vizrank.compute_score((1, 0)), + [-0.1094, -0.1094, 0.1828], 4) + + def test_row_for_state(self): + row = self.vizrank.row_for_state((-0.2, 0.2, 0.1), (1, 0)) + self.assertEqual(row[0].data(Qt.DisplayRole), "+0.200") + self.assertEqual(row[0].data(CorrelationRank.PValRole), 0.1) + self.assertEqual(row[1].data(Qt.DisplayRole), self.attrs[0].name) + self.assertEqual(row[2].data(Qt.DisplayRole), self.attrs[1].name) + + def 
test_iterate_states_by_feature(self): + self.vizrank.sel_feature_index = 2 + states = self.vizrank.iterate_states_by_feature() + self.assertListEqual([(2, 0), (2, 1), (2, 3)], list(states)) diff --git a/Orange/widgets/visualize/utils/__init__.py b/Orange/widgets/visualize/utils/__init__.py index d0a1275a66a..8fb02cd2619 100644 --- a/Orange/widgets/visualize/utils/__init__.py +++ b/Orange/widgets/visualize/utils/__init__.py @@ -75,6 +75,9 @@ class VizRankDialog(QDialog, ProgressBarMixin, WidgetMessagesMixin): captionTitle = "" + NEGATIVE_COLOR = QColor(70, 190, 250) + POSITIVE_COLOR = QColor(170, 242, 43) + processingStateChanged = Signal(int) progressBarValueChanged = Signal(float) messageActivated = Signal(Msg)