
Commit ec278bd

Feature as Predictor: New widget

1 parent c2c1648 commit ec278bd
File tree

9 files changed: +1410 -15 lines changed
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
import unittest
from unittest.mock import patch

import numpy as np

from Orange.classification import ColumnLearner, ColumnClassifier
from Orange.data import DiscreteVariable, ContinuousVariable, Domain, Table


class ColumnTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.domain = Domain([DiscreteVariable("d1", values=["a", "b"]),
                             DiscreteVariable("d2", values=["c", "d"]),
                             DiscreteVariable("d3", values=["d", "c"]),
                             ContinuousVariable("c1"),
                             ContinuousVariable("c2")
                             ],
                            DiscreteVariable("cls", values=["c", "d"]),
                            [DiscreteVariable("m1", values=["a", "b"]),
                             DiscreteVariable("m2", values=["d"]),
                             ContinuousVariable("c3")]
                            )
        cls.data = Table.from_numpy(
            cls.domain,
            np.array([[0, 0, 0, 1, 0.5],
                      [0, 1, 1, 0.25, -3],
                      [1, 0, np.nan, np.nan, np.nan]]),
            np.array([0, 1, 1]),
            np.array([[0, 0, 2],
                      [1, 0, 8],
                      [np.nan, np.nan, 5]])
        )

    @patch("Orange.classification.column.ColumnModel")
    def test_fit_storage(self, clsfr):
        learner = ColumnLearner(self.domain.class_var, self.domain["d2"])
        self.assertEqual(learner.name, "column 'd2'")
        learner.fit_storage(self.data)
        clsfr.assert_called_with(self.domain.class_var, self.domain["d2"], None, None)

        learner = ColumnLearner(self.domain.class_var, self.domain["c3"])
        learner.fit_storage(self.data)
        clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], None, None)

        learner = ColumnLearner(self.domain.class_var, self.domain["c3"], 42, 3.5)
        self.assertEqual(learner.name, "column 'c3'")
        learner.fit_storage(self.data)
        clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], 42, 3.5)

    def test_classifier_init_checks(self):
        cls = ColumnClassifier(self.domain.class_var, self.domain["d2"])
        cls.name = "column 'd2'"

        cls = ColumnClassifier(self.domain.class_var, self.domain["d3"])
        cls.name = "column 'd3'"

        cls = ColumnClassifier(self.domain.class_var, self.domain["c3"])
        cls.name = "column 'c3'"

        self.assertRaises(
            ValueError,
            ColumnClassifier,
            self.domain.class_var, self.domain["d1"])

        self.assertRaises(
            ValueError,
            ColumnClassifier,
            DiscreteVariable("x", values=("a", "b", "c")), self.domain["c3"])

    def test_check_prob_range(self):
        self.assertTrue(
            ColumnClassifier.valid_prob_range(np.array([0, 0.5, 1]))
        )
        self.assertTrue(
            ColumnClassifier.valid_prob_range(np.array([0, 0.5, np.nan]))
        )
        self.assertFalse(
            ColumnClassifier.valid_prob_range(np.array([0, 0.5, 1.5]))
        )
        self.assertFalse(
            ColumnClassifier.valid_prob_range(np.array([0, 0.5, -1]))
        )

    def test_check_value_sets(self):
        d1, d2, d3, *_ = self.domain.attributes
        c = self.domain.class_var
        m2: DiscreteVariable = self.domain["m2"]
        self.assertFalse(ColumnClassifier.valid_value_sets(c, d1))
        self.assertTrue(ColumnClassifier.valid_value_sets(c, d2))
        self.assertTrue(ColumnClassifier.valid_value_sets(c, d3))
        self.assertTrue(ColumnClassifier.valid_value_sets(c, m2))
        self.assertFalse(ColumnClassifier.valid_value_sets(m2, c))

    def test_predict_discrete(self):
        # Just copy
        model = ColumnClassifier(self.domain.class_var, self.domain["d2"])
        self.assertEqual(model.name, "column 'd2'")
        classes, probs = model(self.data, model.ValueProbs)
        np.testing.assert_equal(classes, [0, 1, 0])
        np.testing.assert_equal(probs, [[1, 0], [0, 1], [1, 0]])

        # Values are not in the same order -> map
        model = ColumnClassifier(self.domain.class_var, self.domain["d3"])
        classes, probs = model(self.data, model.ValueProbs)
        np.testing.assert_equal(classes, [1, 0, np.nan])
        np.testing.assert_equal(probs, [[0, 1], [1, 0], [0.5, 0.5]])

        # Not in the same order, and one is missing -> map
        model = ColumnClassifier(self.domain.class_var, self.domain["m2"])
        classes, probs = model(self.data, model.ValueProbs)
        np.testing.assert_equal(classes, [1, 1, np.nan])
        np.testing.assert_equal(probs, [[0, 1], [0, 1], [0.5, 0.5]])

        # Non-binary class
        domain = Domain(
            self.domain.attributes,
            DiscreteVariable("cls", values=["a", "c", "b", "d", "e"]))
        data = Table.from_numpy(domain, self.data.X, self.data.Y)
        model = ColumnClassifier(domain.class_var, domain["d3"])
        classes, probs = model(data, model.ValueProbs)
        np.testing.assert_equal(classes, [3, 1, np.nan])
        np.testing.assert_almost_equal(
            probs,
            np.array([[0, 0, 0, 1, 0],
                      [0, 1, 0, 0, 0],
                      [0.2, 0.2, 0.2, 0.2, 0.2]]))

    def test_predict_as_direct_probs(self):
        model = ColumnClassifier(self.domain.class_var, self.domain["c1"])
        self.assertEqual(model.name, "column 'c1'")
        classes, probs = model(self.data, model.ValueProbs)
        np.testing.assert_equal(classes, [1, 0, np.nan])
        np.testing.assert_equal(probs, [[0, 1], [0.75, 0.25], [0.5, 0.5]])

        model = ColumnClassifier(self.domain.class_var, self.domain["c2"])
        self.assertRaises(ValueError, model, self.data)

        model = ColumnClassifier(self.domain.class_var, self.domain["c3"])
        self.assertRaises(ValueError, model, self.data)

    def test_predict_with_logistic(self):
        model = ColumnClassifier(
            self.domain.class_var, self.domain["c1"], 0.5, 3)
        classes, probs = model(self.data, model.ValueProbs)
        np.testing.assert_equal(classes, [1, 0, np.nan])
        np.testing.assert_almost_equal(
            probs[:, 1], [1 / (1 + np.exp(-3 * (1 - 0.5))),
                          1 / (1 + np.exp(-3 * (0.25 - 0.5))),
                          0.5])
        np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1])

        model = ColumnClassifier(
            self.domain.class_var, self.domain["c2"], 0.5, 3)
        classes, probs = model(self.data, model.ValueProbs)
        np.testing.assert_equal(classes, [0, 0, np.nan])
        np.testing.assert_almost_equal(
            probs[:, 1], [1 / (1 + np.exp(-3 * (0.5 - 0.5))),
                          1 / (1 + np.exp(-3 * (-3 - 0.5))),
                          0.5])
        np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1])


if __name__ == "__main__":
    unittest.main()
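
As a side note, the probabilities asserted in test_predict_with_logistic can be reproduced with plain NumPy. The sketch below is only an editorial illustration of that arithmetic; the two extra constructor arguments are passed positionally as 0.5 and 3, and the names "offset" and "steepness" used in the comments are not taken from the commit:

    import numpy as np

    x = np.array([1.0, 0.25])               # the finite values of column c1
    p1 = 1 / (1 + np.exp(-3 * (x - 0.5)))   # logistic with offset 0.5, steepness 3
    print(np.round(p1, 4))                  # [0.8176 0.3208]
    print((p1 > 0.5).astype(float))         # [1. 0.] -- the classes the test asserts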

Orange/modelling/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 from .randomforest import *
 from .svm import *
 from .tree import *
+from .column import *
 try:
     from .catgb import *
 except ImportError:

Orange/modelling/column.py

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
from typing import Optional

import numpy as np

from Orange.data import Variable, DiscreteVariable, Domain, Table
from Orange.classification import LogisticRegressionLearner
from Orange.regression import LinearRegressionLearner
from Orange.modelling import Model, Learner

__all__ = ["ColumnLearner", "ColumnModel"]


def _check_column_combinations(
        class_var: Variable,
        column: Variable,
        fit_regression: bool):
    if class_var.is_continuous:
        if not column.is_continuous:
            raise ValueError(
                "Regression can only be used with numeric variables")
        return

    assert isinstance(class_var, DiscreteVariable)  # remove type warnings
    if column.is_continuous:
        if len(class_var.values) != 2:
            raise ValueError(
                "Numeric columns can only be used with binary class variables")
    else:
        assert isinstance(column, DiscreteVariable)
        if not valid_value_sets(class_var, column):
            raise ValueError(
                "Column contains values that are not in class variable")
    if fit_regression and not column.is_continuous:
        raise ValueError(
            "Intercept and coefficient are only allowed for continuous "
            "variables")


def valid_prob_range(values: np.ndarray):
    return np.nanmin(values) >= 0 and np.nanmax(values) <= 1


def valid_value_sets(class_var: DiscreteVariable,
                     column_var: DiscreteVariable):
    return set(column_var.values) <= set(class_var.values)


class ColumnLearner(Learner):
    def __init__(self,
                 class_var: Variable,
                 column: Variable,
                 fit_regression: bool = False):
        super().__init__()
        _check_column_combinations(class_var, column, fit_regression)
        self.class_var = class_var
        self.column = column
        self.fit_regression = fit_regression
        self.name = f"column '{column.name}'"

    def __fit_coefficients(self, data: Table):
        # Use learners from Orange rather than directly calling
        # scikit-learn, so that we make sure we use the same parameters
        # and get the same result as we would if we used the widgets.
        data1 = data.transform(Domain([self.column], self.class_var))
        if self.class_var.is_discrete:
            model = LogisticRegressionLearner()(data1)
            return model.intercept[0], model.coefficients[0][0]
        else:
            model = LinearRegressionLearner()(data1)
            return model.intercept, model.coefficients[0]

    def fit_storage(self, data: Table):
        if data.domain.class_var != self.class_var:
            raise ValueError("Class variable does not match the data")
        if not self.fit_regression:
            return ColumnModel(self.class_var, self.column)

        intercept, coefficient = self.__fit_coefficients(data)
        return ColumnModel(self.class_var, self.column, intercept, coefficient)


class ColumnModel(Model):
    def __init__(self,
                 class_var: Variable,
                 column: Variable,
                 intercept: Optional[float] = None,
                 coefficient: Optional[float] = None):
        super().__init__(Domain([column], class_var))

        _check_column_combinations(class_var, column, intercept is not None)
        if (intercept is not None) is not (coefficient is not None):
            raise ValueError(
                "Intercept and coefficient must both be provided or absent")

        self.class_var = class_var
        self.column = column
        self.intercept = intercept
        self.coefficient = coefficient
        if (column.is_discrete and
                class_var.values[:len(column.values)] != column.values):
            self.value_mapping = np.array([class_var.to_val(x)
                                           for x in column.values])
        else:
            self.value_mapping = None

        pars = f" ({intercept}, {coefficient})" if intercept is not None else ""
        self.name = f"column '{column.name}'{pars}"

    def predict_storage(self, data: Table):
        vals = data.get_column(self.column)
        if self.class_var.is_discrete:
            return self._predict_discrete(vals)
        else:
            return self._predict_continuous(vals)

    def _predict_discrete(self, vals):
        assert isinstance(self.class_var, DiscreteVariable)
        nclasses = len(self.class_var.values)
        proba = np.full((len(vals), nclasses), np.nan)
        rows = np.isfinite(vals)
        if self.column.is_discrete:
            mapped = vals[rows].astype(int)
            if self.value_mapping is not None:
                mapped = self.value_mapping[mapped]
                vals = vals.copy()
                vals[rows] = mapped
            proba[rows] = 0
            proba[rows, mapped] = 1
        else:
            if self.coefficient is None:
                if not valid_prob_range(vals):
                    raise ValueError("Column values must be in [0, 1] range "
                                     "unless logistic function is applied")
                proba[rows, 1] = vals[rows]
            else:
                proba[rows, 1] = (
                    1 /
                    (1 + np.exp(-self.intercept - self.coefficient * vals[rows])
                     ))

            proba[rows, 0] = 1 - proba[rows, 1]
            vals = (proba[:, 1] > 0.5).astype(float)
            vals[~rows] = np.nan
        return vals, proba

    def _predict_continuous(self, vals):
        if self.coefficient is None:
            return vals
        else:
            return vals * self.coefficient + self.intercept

    def __str__(self):
        pars = f" ({self.intercept}, {self.coefficient})" \
            if self.intercept is not None else ""
        return f'ColumnModel {self.column.name}{pars}'
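
For orientation, here is a minimal usage sketch of the ColumnLearner/ColumnModel pair defined above. It is an editorial illustration, not part of the commit: it assumes an Orange installation in which this file is importable as Orange.modelling.column, and the variable names "p" and "cls" are invented for the example. With fit_regression left at its default of False, the numeric column is used directly as the probability of the class value at index 1:

    import numpy as np

    from Orange.data import ContinuousVariable, DiscreteVariable, Domain, Table
    from Orange.modelling.column import ColumnLearner

    # Binary class "cls" and a single numeric feature "p" holding P(cls == "d").
    class_var = DiscreteVariable("cls", values=["c", "d"])
    domain = Domain([ContinuousVariable("p")], class_var)
    data = Table.from_numpy(domain,
                            np.array([[0.1], [0.8], [0.5]]),
                            np.array([0, 1, 1]))

    # No coefficient fitting: the column values are taken as probabilities as-is.
    model = ColumnLearner(class_var, domain["p"]).fit_storage(data)
    values, probs = model(data, model.ValueProbs)
    # values should come out as [0, 1, 0] and probs as
    # [[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]], mirroring _predict_discrete above.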
