
Commit 942be60

Merge pull request #114 from x-tabdeveloping/npmi

Added term importance method based on NPMI

2 parents 15919a0 + 19e99bf
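For context: as implemented in `turftopic/feature_importance.py` below, the new `npmi` method scores a term $w$ in a cluster $t$ from smoothed estimates of $p(w)$, $p(t)$, and $p(w|t)$:

$$\mathrm{score}(w, t) = \frac{\log_2 p(w|t) - \log_2 p(w)}{\log_2 p(t) - \log_2 p(w|t)}$$

The numerator is the pointwise mutual information between the term and the cluster; the denominator is the normalization the implementation applies.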

File tree

5 files changed: +51 −28 lines

- .github/workflows/tests.yml
- docs/clustering.md
- turftopic/feature_importance.py
- turftopic/models/_hierarchical_clusters.py
- turftopic/models/cluster.py

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.11"]
     #
     # This allows a subsequently queued workflow run to interrupt previous runs
     concurrency:

docs/clustering.md

Lines changed: 9 additions & 0 deletions

@@ -122,6 +122,7 @@ By and large there are two types of methods that can be used for importance esti
 | - | - | - | - |
 | `soft-c-tf-idf` *(default)* | Lexical | A c-tf-idf method that can interpret soft cluster assignments. | Can interpret soft cluster assignments in models like Gaussian Mixtures, less sensitive to stop words than vanilla c-tf-idf. |
 | `fighting-words` **(NEW)** | Lexical | Compute word importance based on cluster differences using the Fightin' Words algorithm by Monroe et al. | A theoretically motivated probabilistic model that was explicitly designed for discovering lexical differences in groups of text. See [Fightin' Words paper](https://languagelog.ldc.upenn.edu/myl/Monroe.pdf). |
+| `npmi` **(NEW)** | Lexical | Estimate term importance from mutual information between cluster labels and term occurrence. | Theoretically motivated, fast, and usually produces clean topics. |
 | `c-tf-idf` | Lexical | Compute how unique terms are in a cluster with a tf-idf style weighting scheme. This is the default in BERTopic. | Very fast, easy to understand and is not affected by cluster shape. |
 | `centroid` | Semantic | Word importance based on words' proximity to cluster centroid vectors. This is the default in Top2Vec. | Produces clean topics, easily interpretable. |
 | `linear` **(NEW, EXPERIMENTAL)** | Semantic | Project words onto the parameter vectors of a linear classifier (LDA). | Topic differences are measured in embedding space and are determined by predictive power, and are therefore accurate and clean. |
@@ -195,6 +196,14 @@ By and large there are two types of methods that can be used for importance esti
     model = ClusteringTopicModel(feature_importance="linear")
     ```
 
+=== "NPMI"
+
+    ```python
+    from turftopic import ClusteringTopicModel
+
+    model = ClusteringTopicModel(feature_importance="npmi")
+    ```
+
 
 You can also choose to recalculate term importances with a different method after fitting the model:
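Picking up that pointer from the docs, here is a minimal sketch of switching to the new method after fitting. The 20 Newsgroups loading code is illustrative, and we assume `estimate_components` accepts the `feature_importance` argument its docstring in this commit describes:

```python
from sklearn.datasets import fetch_20newsgroups
from turftopic import ClusteringTopicModel

# Illustrative corpus; any list of strings works.
corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data

# Fit with the default importance method (soft-c-tf-idf)...
model = ClusteringTopicModel().fit(corpus)
# ...then recompute term importances with the newly added NPMI method.
model.estimate_components(feature_importance="npmi")
model.print_topics()
```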

turftopic/feature_importance.py

Lines changed: 31 additions & 15 deletions

@@ -207,37 +207,53 @@ def ctf_idf(
     return np.stack(components), idf_diag
 
 
-def bayes_rule(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+def npmi(
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    smoothing: int = 5,
 ) -> np.ndarray:
-    """Computes feature importance based on Bayes' rule.
-    The importance of a word for a topic is the probability of the topic conditional on the word.
+    """Uses normalized pointwise mutual information between
+    clusters and terms to calculate term importance scores.
 
-    $$p(t|w) = \\frac{p(w|t) * p(t)}{p(w)}$$
+    To avoid underestimating individual words' occurrences (overfitting),
+    a smoothing term is added, which is mathematically equivalent
+    to using the MAP estimate of a symmetric Dirichlet-multinomial model.
 
     Parameters
     ----------
     doc_topic_matrix: np.ndarray
         Document-topic matrix of shape (n_documents, n_topics)
     doc_term_matrix: np.ndarray
         Document-term matrix of shape (n_documents, vocab_size)
+    smoothing: int, default 5
+        Alpha parameter of the symmetric Dirichlet-multinomial.
+        Corresponds to assuming that each term and cluster
+        occurred this many more times than observed.
 
     Returns
     -------
     ndarray of shape (n_topics, vocab_size)
         Term importance matrix.
     """
     eps = np.finfo(float).eps
-    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0)))
+    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) + smoothing
     p_w = p_w / p_w.sum()
     p_w[p_w <= 0] = eps
-    p_t = doc_topic_matrix.sum(axis=0)
+    p_t = doc_topic_matrix.sum(axis=0) + smoothing
     p_t = p_t / p_t.sum()
-    term_importance = doc_topic_matrix.T @ doc_term_matrix
-    overall_in_topic = np.abs(term_importance).sum(axis=1)
-    overall_in_topic[overall_in_topic <= 0] = eps
-    p_wt = (term_importance.T / (overall_in_topic)).T
-    p_wt /= p_wt.sum(axis=1)[:, None]
-    p_tw = (p_wt.T * p_t).T / p_w
-    p_tw /= np.nansum(p_tw, axis=0)
-    return p_tw
+    labels = np.argmax(doc_topic_matrix, axis=1)
+    p_wt = []
+    for i in np.arange(doc_topic_matrix.shape[1]):
+        _p_w = (
+            np.squeeze(np.asarray(doc_term_matrix[labels == i].sum(axis=0)))
+            + smoothing
+        )
+        _p_w = _p_w / _p_w.sum()
+        _p_w[_p_w <= 0] = eps
+        p_wt.append(_p_w)
+    p_wt = np.stack(p_wt)
+    log_p_wt = np.log2(p_wt)
+    numerator = log_p_wt - np.log2(p_w)
+    denominator = -(log_p_wt.T - np.log2(p_t)).T
+    res = numerator / denominator
+    return res
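For illustration, a minimal sketch of calling the new function directly; the toy counts and one-hot cluster assignments are made up, only the signature comes from the diff above:

```python
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import npmi

# Toy data: 4 documents, 2 clusters, 3-term vocabulary (illustrative only).
doc_term_matrix = spr.csr_matrix(
    np.array(
        [
            [2, 0, 1],
            [3, 1, 0],
            [0, 4, 1],
            [0, 3, 2],
        ]
    )
)
# One-hot cluster assignments: first two documents in cluster 0, rest in cluster 1.
doc_topic_matrix = np.array(
    [
        [1.0, 0.0],
        [1.0, 0.0],
        [0.0, 1.0],
        [0.0, 1.0],
    ]
)

scores = npmi(doc_topic_matrix, doc_term_matrix, smoothing=5)
print(scores.shape)  # (n_topics, vocab_size) -> (2, 3)
```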

turftopic/models/_hierarchical_clusters.py

Lines changed: 3 additions & 3 deletions

@@ -10,11 +10,11 @@
 
 from turftopic.base import ContextualModel
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.hierarchical import TopicNode
@@ -221,8 +221,8 @@ def _estimate_children_components(self) -> dict[int, np.ndarray]:
                 self.model.embeddings,
                 self.model.vocab_embeddings,
             )
-        elif self.model.feature_importance == "bayes":
-            components = bayes_rule(
+        elif self.model.feature_importance == "npmi":
+            components = npmi(
                 document_topic_matrix, self.model.doc_term_matrix
             )
         else:

turftopic/models/cluster.py

Lines changed: 7 additions & 9 deletions

@@ -23,11 +23,11 @@
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.encoders.multimodal import MultimodalEncoder
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.models._hierarchical_clusters import (
@@ -64,7 +64,7 @@
     "soft-c-tf-idf",
     "c-tf-idf",
     "centroid",
-    "bayes",
+    "npmi",
     "linear",
     "fighting-words",
 ]
@@ -157,7 +157,7 @@ class ClusteringTopicModel(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
         'fighting-words' calculates word importances based on the Fighting Words
@@ -293,7 +293,7 @@ def estimate_components(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
         'fighting-words' calculates word importances based on the Fighting Words
@@ -564,7 +564,7 @@ def estimate_temporal_components(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
@@ -605,10 +605,8 @@ def estimate_temporal_components(
             self.temporal_components_[i_timebin], _ = soft_ctf_idf(
                 t_doc_topic, t_dtm, return_idf=True
             )
-        elif feature_importance == "bayes":
-            self.temporal_components_[i_timebin] = bayes_rule(
-                t_doc_topic, t_dtm
-            )
+        elif feature_importance == "npmi":
+            self.temporal_components_[i_timebin] = npmi(t_doc_topic, t_dtm)
         elif feature_importance == "fighting-words":
             self.temporal_components_[i_timebin] = fighting_words(
                 t_doc_topic, t_dtm
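Since NPMI is also wired into the per-time-bin components above, here is a hedged sketch of using it in a dynamic workflow; `fit_dynamic`, `print_topics_over_time`, and the synthetic timestamps are assumptions based on turftopic's dynamic topic modeling API, not part of this diff:

```python
from datetime import datetime

from sklearn.datasets import fetch_20newsgroups
from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data
# Synthetic monthly timestamps, one per document (real data brings its own).
timestamps = [datetime(2020, (i % 12) + 1, 1) for i in range(len(corpus))]

# NPMI is used for both the static and the per-time-bin term importances.
model = ClusteringTopicModel(feature_importance="npmi")
model.fit_dynamic(corpus, timestamps=timestamps, bins=10)
model.print_topics_over_time()
```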
