-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathchimerge.py
234 lines (199 loc) · 10.8 KB
/
chimerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
__author__ = 'lisette.espin'
######################################################################################################################
# SYSTEM DEPENDENCES
######################################################################################################################
import numpy as np
import math
######################################################################################################################
# LOCAL DEPENDENCES
######################################################################################################################
import utils
######################################################################################################################
# CONSTANTS
# Threshold reference: (http://sites.stat.psu.edu/~mga/401/tables/Chi-square-table.pdf)
######################################################################################################################
MIN_NUMBER_INTERVALS = 2
######################################################################################################################
# ChiMerge CLASS
######################################################################################################################
class ChiMerge():
'''
1992 by R. Kerber
Reference: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf
'''
def __init__(self, min_expected_value, max_number_intervals, threshold):
'''
chi-square distribution table: http://sites.stat.psu.edu/~mga/401/tables/Chi-square-table.pdf
:param min_expected_value:
:param max_number_intervals:
:param threshold:
:return:
'''
self.data = None
self.sorted_data = None
self.frequency_matrix = None
self.frequency_matrix_intervals = None
self.nclasses = -1
self.nattrinutes = -1
self.degrees_freedom = -1
self.min_expected_value = min_expected_value
self.min_number_intervals = MIN_NUMBER_INTERVALS
self.max_number_intervals = max_number_intervals
self.threshold = threshold
def loadData(self, data, issorted=False):
'''
:param data: numpy matrix
:param issorted: boolean, if data is already sorted, no need to sort again (based on attribute_column)
:return:
'''
if type(data) != np.matrix and type(data) != np.array:
utils.printf('ERROR: data must be a numpy.matrix or numpy.array')
return
self.data = data # numpy.matrix (x,2). column index 0 refers to attributes column and index 1 classes
if not issorted:
self.sorted_data = np.array(np.sort(data.view('i8,i8'), order=['f0'], axis=0).view(np.float)) #always sorting column 0 (attribute column)
else:
self.sorted_data = np.array(data)
utils.printf('Sorted data: matrix {}x{}'.format(self.sorted_data.shape[0],self.sorted_data.shape[1]))
def loadFrequencyMatrix(self, frequency_matrix, unique_attribute_values):
'''
:param frequency_matrix: numpy array
:return: void
'''
if type(frequency_matrix) != np.array:
utils.printf('ERROR: data must be a numpy.array')
return
self.frequency_matrix = frequency_matrix
self.frequency_matrix_intervals = unique_attribute_values
self.nclasses = self.frequency_matrix.shape[1]
self.degrees_freedom = self.nclasses - 1
self.printInitialSummary()
def generateFrequencyMatrix(self):
if self.sorted_data is None:
utils.printf('ERROR: Your (sorted) data should be loaded!')
return
if self.sorted_data.shape[1] != 2:
utils.printf('ERROR: Your (sorted) matrix should have 2 columns only (attribute, class)')
return
unique_attribute_values, indices = np.unique(self.sorted_data[:,0], return_inverse=True) # first intervals: unique attribute values
unique_class_values = np.unique(self.sorted_data[:,1]) # classes (column index 1)
self.frequency_matrix = np.zeros((len(unique_attribute_values), len(unique_class_values))) # init frequency_matrix
self.frequency_matrix_intervals = unique_attribute_values # init intervals (unique attribute values)
self.nclasses = len(unique_class_values) # number of classes
self.degrees_freedom = self.nclasses - 1 # degress of freedom (look at table)
# Generating first frequency values (contingency table), number of instances found in data: attribute-class
for row in np.unique(indices):
for col, clase in enumerate(unique_class_values):
self.frequency_matrix[row,col] += np.where(self.sorted_data[np.where(indices == row)][:,1] == clase)[0].shape[0]
self.printInitialSummary()
def chisqrtest(self, array):
'''
:param array: np.array 2 consecutive rows from frequeny attribute/class matrix, e.g.,: a = np.matrix('16 0 0; 4 1 1')
:return chisqr value of distribution of 2 rows
'''
shape = array.shape
N = float(array.sum()) # total number of observations
r = self._getTotalsPerRow(array)
c = self._getTotalsPerColumn(array)
chisqr = 0
for row in range(shape[0]):
for col in range(shape[1]):
e = r[row]*c[col] / N # expected value
o = array[row,col] # observed value
e = self.min_expected_value if e < self.min_expected_value else e
chisqr += 0. if e == 0. else math.pow((o - e),2) / float(e)
return chisqr
def chimerge(self):
if self.frequency_matrix is None:
utils.printf('ERROR: Your frequency matrix should be loaded!')
return
chitest = {}
counter = 0
smallest = -1
while self._too_many_intervals():
###
# CHI2 TEST
###
chitest = {}
shape = self.frequency_matrix.shape
for r in range(shape[0] - 1):
interval = r,r+1
chi2 = self.chisqrtest(self.frequency_matrix[[interval],:][0])
if chi2 not in chitest:
chitest[chi2] = []
chitest[chi2].append( (interval) )
smallest = min(chitest.keys())
biggest = max(chitest.keys())
###
# SUMMARY
###
counter += 1
utils.printf('')
utils.printf('ROUND {}: {} intervals. Chi min:{}, Chi max:{}'.format(counter, self.frequency_matrix.shape[0], smallest, biggest))
utils.printf('CHI2 VALUES: {}'.format(chitest.keys()))
###
# MERGE
###
if self._more_merges(smallest):
utils.printf('MERGING INTERVALS: chi {} -> {}'.format(smallest, chitest[smallest]))
for (lower,upper) in list(reversed(chitest[smallest])): # reversed, to be able to remove rows on the fly
for col in range(shape[1]): # checking columns (to append values from row i+1 ---to be removed--- to row i)
self.frequency_matrix[lower,col] += self.frequency_matrix[upper,col] # appending frequencies to the remaining interval
self.frequency_matrix = np.delete(self.frequency_matrix, upper, 0) # removing interval (because we merged it in the previous step)
self.frequency_matrix_intervals = np.delete(self.frequency_matrix_intervals, upper, 0) # also removing the corresponding interval (real values)
utils.printf('NEW INTERVALS: ({}):{}'.format(len(self.frequency_matrix_intervals),self.frequency_matrix_intervals))
else:
break
self.chitestvalues = chitest
utils.printf('END (chi {} > {})\n'.format(smallest, self.threshold))
##############################################################
# Printing (output)
##############################################################
def printInitialSummary(self):
utils.printf('')
utils.printf('ROUND 0: Initial values:')
utils.printf('- Number of classes: {}'.format(self.nclasses))
utils.printf('- Degrees of Freedom: {} (deprecated)'.format(self.degrees_freedom))
utils.printf('- Threshold: {}'.format(self.threshold))
utils.printf('- Max number of intervals: {}'.format(self.max_number_intervals))
utils.printf('- Number of (unique) intervals: {}'.format(len(self.frequency_matrix_intervals)))
utils.printf('- Frequency matrix: {}x{} (sum {})'.format(self.frequency_matrix.shape[0], self.frequency_matrix.shape[1], self.frequency_matrix.sum()))
utils.printf('- Intervals: {}'.format(self.frequency_matrix_intervals))
def printFinalSummary(self):
utils.printf('FINAL SUMMARY')
utils.printf('{}{}'.format('Intervals: ',self.frequency_matrix_intervals))
utils.printf('{}{}'.format('Chi2: ',', '.join(['[{}-{}):{:5.1f}'.format(v[0][0],v[0][1],k) for k,v in utils.sortDictByValue(self.chitestvalues,False)])))
utils.printf('{} ({}x{})\n{}'.format('Interval-Class Frequencies',self.frequency_matrix.shape[0],self.frequency_matrix.shape[1],self.frequency_matrix))
##############################################################
# Handlers
##############################################################
def _continue(self, smallest):
c1 = self._too_many_intervals()
c2 = self._more_merges(smallest)
utils.printf('- Too many intervals?: {}'.format(c1))
utils.printf('- Can we merge more?: {}'.format(c2))
return c1 or c2
def _more_merges(self, smallest):
return smallest <= self.threshold #sum([chi2 > self.threshold for chi2 in chi2keys]) == len(chi2keys)
def _too_many_intervals(self):
return self.frequency_matrix.shape[0] > self.max_number_intervals
def _getTotalsPerRow(self, narray):
'''
:param narray: numpy.array 2 consecutive rows from frequeny attribute/class
:return: dictionary with total number of observations per i_th row
'''
shape = narray.shape
r = {}
for i in range(shape[0]):
r[i] = narray[i].sum()
return r
def _getTotalsPerColumn(self, narray):
'''
:param narray: numpy.array 2 consecutive rows from frequeny attribute/class matrix
:return: dictionary with total number of observations per j_th column
'''
shape = narray.shape
c = {}
for j in range(shape[1]):
c[j] = narray[:,j].sum()
return c