forked from SleepyBag/Statistical-Learning-Methods
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMaxEntropy.py
164 lines (148 loc) · 5.61 KB
/
MaxEntropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from rich.console import Console
from rich.table import Table
import numpy as np
from numpy import linalg
import sys
import os
from pathlib import Path
sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent))
from utils import softmax, line_search
class MaxEntropy:
def __init__(self, epsilon=1e-6, max_steps=1000, verbose=True):
self.epsilon = epsilon
self.max_steps = max_steps
self.verbose = verbose
def _p_w(self, w):
"""
calculate probability table according to w
"""
logit = (w[:, None, None] * self.feature).sum(axis=0)
p_w = softmax(logit, axis=-1)
return p_w
def _f(self, w):
"""
object function
"""
return \
(
np.log(
np.exp(
(w[:, None, None] * self.feature
).sum(axis=0)
).sum(axis=-1)) * self.p_data_x
).sum() - \
(
self.p_data * (w[:, None, None] * self.feature).sum(axis=0)
).sum()
def _g(self, w):
"""
gradient of object function
"""
p_w = self._p_w(w)
return (self.p_data_x[None, :, None] * p_w[None, :, :] * self.feature
).sum(axis=(1, 2)) - self.E_feature
def fit(self, p_data, feature):
"""
optimize max entropy model with BFGS
p_data: matrix of shape [nx, ny], possibility of all (x, y)
feature: matrix of shape[nf, nx, ny], all the feature functions of all (x, y)
"""
# nf is the number of feature functions, and the size of w
self.nf, self.nx, self.ny = feature.shape
self.feature = feature
self.p_data = p_data
self.p_data_x = p_data.sum(axis=-1)
self.E_feature = (p_data[None, :, :] * feature).sum(axis=(1, 2))
# initlaize optimizer
self.w = np.random.rand(self.nf)
B = np.eye(self.nf)
g_next = self._g(self.w)
g_norm = linalg.norm(g_next)
# optimize
for i in range(self.max_steps):
g = g_next
if self.verbose:
print(f"Step {i}, L2 norm of gradient is {g_norm}")
if g_norm < self.epsilon:
break
p = linalg.solve(B, -g)
f_lambda = lambda x: self._f(self.w + x * p)
lamda = line_search(f_lambda, 0, 100, epsilon=self.epsilon)
delta_w = lamda * p
self.w += delta_w
g_next = self._g(self.w)
g_norm = linalg.norm(g_next)
if g_norm < self.epsilon:
print(f"L2 norm of gradient is {g_norm}, stop training...")
break
delta_g = g_next - g
B_delta_w = B @ delta_w
B += np.outer(delta_g, delta_g) / (delta_g @ delta_w) - \
np.outer(B_delta_w, B_delta_w) / (B_delta_w.T @ delta_w)
self.p_w = self._p_w(self.w)
def predict(self, x, y):
"""predict p(y|x)"""
return self.p_w[x][y]
# The following examples are proposed by SleepyBag at
# https://www.zhihu.com/question/24094554/answer/1507080982
if __name__ == "__main__":
console = Console(markup=False)
def float2str(x):
return "%.3f" % x
def demonstrate(data, feature_functions):
max_entropy = MaxEntropy()
max_entropy.fit(data, feature_functions)
# print results
for i, ff in enumerate(feature_functions):
table = Table(f'feature {i}', 'y=1', 'y=2', 'y=3')
for x in range(2):
table.add_row(f'x={x}', *map(float2str, [ff[x, y] for y in range(3)]))
console.print(table)
table = Table('prob', 'y=1', 'y=2', 'y=3')
for x in range(2):
table.add_row(f'x={x}', *map(float2str, [max_entropy.predict(x, y) for y in range(3)]))
console.print(table)
# ---------------------- Prepare Data -----------------------------------------
data = np.array([[.125, .25, .125],
[.5, 0., 0.]])
table = Table('data', 'y=1', 'y=2', 'y=3')
for x in range(2):
table.add_row(f'x={x}', *map(float2str, [data[x, y] for y in range(3)]))
console.print(table)
# ---------------------- Example 1---------------------------------------------
print('Example 1: Single feature function')
feature_functions = np.array([
[[1, 0, 0],
[0, 0, 0]]
])
demonstrate(data, feature_functions)
# ---------------------- Example 3---------------------------------------------
print('Example 2: the value of feature function doesn\'t matter for feature function with only one non-zero value')
feature_functions = np.array([
[[0.5, 0, 0],
[0, 0, 0]]
])
demonstrate(data, feature_functions)
# ---------------------- Example 2---------------------------------------------
print('Example 3: double feature functions')
feature_functions = np.array([
[[1, 0, 0],
[0, 0, 0]],
[[0, 0, 0],
[0, 1, 0]]
])
demonstrate(data, feature_functions)
# ---------------------- Example 3---------------------------------------------
print('Example 4: single feature function with two non-zeros')
feature_functions = np.array([
[[0, 1, 1],
[0, 0, 0]]
])
demonstrate(data, feature_functions)
# ---------------------- Example 3---------------------------------------------
print('Example 5: the value of feature function matters for feature function with multiple non-zero values')
feature_functions = np.array([
[[0, 1, .5],
[0, 0, 0]]
])
demonstrate(data, feature_functions)