
Commit 942be60

Merge pull request #114 from x-tabdeveloping/npmi

Added term importance method based on NPMI

2 parents 15919a0 + 19e99bf
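For context: as implemented in `turftopic/feature_importance.py` below, the new `npmi` method scores a term $w$ in a cluster $t$ from smoothed estimates of $p(w)$, $p(t)$, and $p(w|t)$:

$$\mathrm{score}(w, t) = \frac{\log_2 p(w|t) - \log_2 p(w)}{\log_2 p(t) - \log_2 p(w|t)}$$

The numerator is the pointwise mutual information between the term and the cluster; the denominator is the normalization the implementation applies.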

File tree

5 files changed: +51 −28 lines

- .github/workflows/tests.yml
- docs/clustering.md
- turftopic/feature_importance.py
- turftopic/models/_hierarchical_clusters.py
- turftopic/models/cluster.py

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: ["3.11"]
     #
     # This allows a subsequently queued workflow run to interrupt previous runs
     concurrency:

docs/clustering.md

Lines changed: 9 additions & 0 deletions

@@ -122,6 +122,7 @@ By and large there are two types of methods that can be used for importance esti
 | - | - | - | - |
 | `soft-c-tf-idf` *(default)* | Lexical | A c-tf-idf method that can interpret soft cluster assignments. | Can interpret soft cluster assignments in models like Gaussian Mixtures, less sensitive to stop words than vanilla c-tf-idf. |
 | `fighting-words` **(NEW)** | Lexical | Compute word importance based on cluster differences using the Fightin' Words algorithm by Monroe et al. | A theoretically motivated probabilistic model that was explicitly designed for discovering lexical differences in groups of text. See [Fightin' Words paper](https://languagelog.ldc.upenn.edu/myl/Monroe.pdf). |
+| `npmi` **(NEW)** | Lexical | Estimate term importance from mutual information between cluster labels and term occurrence. | Theoretically motivated, fast, and usually produces clean topics. |
 | `c-tf-idf` | Lexical | Compute how unique terms are in a cluster with a tf-idf style weighting scheme. This is the default in BERTopic. | Very fast, easy to understand and is not affected by cluster shape. |
 | `centroid` | Semantic | Word importance based on words' proximity to cluster centroid vectors. This is the default in Top2Vec. | Produces clean topics, easily interpretable. |
 | `linear` **(NEW, EXPERIMENTAL)** | Semantic | Project words onto the parameter vectors of a linear classifier (LDA). | Topic differences are measured in embedding space and are determined by predictive power, and are therefore accurate and clean. |
@@ -195,6 +196,14 @@ By and large there are two types of methods that can be used for importance esti
     model = ClusteringTopicModel(feature_importance="linear")
     ```
 
+=== "NPMI"
+
+    ```python
+    from turftopic import ClusteringTopicModel
+
+    model = ClusteringTopicModel(feature_importance="npmi")
+    ```
+
 
 You can also choose to recalculate term importances with a different method after fitting the model:
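Picking up that pointer from the docs, here is a minimal sketch of switching to the new method after fitting. The 20 Newsgroups loading code is illustrative, and we assume `estimate_components` accepts the `feature_importance` argument its docstring in this commit describes:

```python
from sklearn.datasets import fetch_20newsgroups
from turftopic import ClusteringTopicModel

# Illustrative corpus; any list of strings works.
corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data

# Fit with the default importance method (soft-c-tf-idf)...
model = ClusteringTopicModel().fit(corpus)
# ...then recompute term importances with the newly added NPMI method.
model.estimate_components(feature_importance="npmi")
model.print_topics()
```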

turftopic/feature_importance.py

Lines changed: 31 additions & 15 deletions

@@ -207,37 +207,53 @@ def ctf_idf(
     return np.stack(components), idf_diag
 
 
-def bayes_rule(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+def npmi(
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    smoothing: int = 5,
 ) -> np.ndarray:
-    """Computes feature importance based on Bayes' rule.
-    The importance of a word for a topic is the probability of the topic conditional on the word.
+    """Uses normalized pointwise mutual information between
+    clusters and terms to calculate term importance scores.
 
-    $$p(t|w) = \\frac{p(w|t) * p(t)}{p(w)}$$
+    To avoid underestimating individual words' occurrences (overfitting),
+    a smoothing term is added, which is mathematically equivalent
+    to using the MAP estimate of a symmetric Dirichlet-multinomial model.
 
     Parameters
     ----------
     doc_topic_matrix: np.ndarray
         Document-topic matrix of shape (n_documents, n_topics)
     doc_term_matrix: np.ndarray
         Document-term matrix of shape (n_documents, vocab_size)
+    smoothing: int, default 5
+        Alpha parameter of the symmetric Dirichlet-multinomial.
+        Corresponds to assuming that each term and cluster
+        occurred this many more times than observed.
 
     Returns
     -------
     ndarray of shape (n_topics, vocab_size)
         Term importance matrix.
     """
     eps = np.finfo(float).eps
-    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0)))
+    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) + smoothing
     p_w = p_w / p_w.sum()
     p_w[p_w <= 0] = eps
-    p_t = doc_topic_matrix.sum(axis=0)
+    p_t = doc_topic_matrix.sum(axis=0) + smoothing
     p_t = p_t / p_t.sum()
-    term_importance = doc_topic_matrix.T @ doc_term_matrix
-    overall_in_topic = np.abs(term_importance).sum(axis=1)
-    overall_in_topic[overall_in_topic <= 0] = eps
-    p_wt = (term_importance.T / (overall_in_topic)).T
-    p_wt /= p_wt.sum(axis=1)[:, None]
-    p_tw = (p_wt.T * p_t).T / p_w
-    p_tw /= np.nansum(p_tw, axis=0)
-    return p_tw
+    labels = np.argmax(doc_topic_matrix, axis=1)
+    p_wt = []
+    for i in np.arange(doc_topic_matrix.shape[1]):
+        _p_w = (
+            np.squeeze(np.asarray(doc_term_matrix[labels == i].sum(axis=0)))
+            + smoothing
+        )
+        _p_w = _p_w / _p_w.sum()
+        _p_w[_p_w <= 0] = eps
+        p_wt.append(_p_w)
+    p_wt = np.stack(p_wt)
+    log_p_wt = np.log2(p_wt)
+    numerator = log_p_wt - np.log2(p_w)
+    denominator = -(log_p_wt.T - np.log2(p_t)).T
+    res = numerator / denominator
+    return res
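For illustration, a minimal sketch of calling the new function directly; the toy counts and one-hot cluster assignments are made up, only the signature comes from the diff above:

```python
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import npmi

# Toy data: 4 documents, 2 clusters, 3-term vocabulary (illustrative only).
doc_term_matrix = spr.csr_matrix(
    np.array(
        [
            [2, 0, 1],
            [3, 1, 0],
            [0, 4, 1],
            [0, 3, 2],
        ]
    )
)
# One-hot cluster assignments: first two documents in cluster 0, rest in cluster 1.
doc_topic_matrix = np.array(
    [
        [1.0, 0.0],
        [1.0, 0.0],
        [0.0, 1.0],
        [0.0, 1.0],
    ]
)

scores = npmi(doc_topic_matrix, doc_term_matrix, smoothing=5)
print(scores.shape)  # (n_topics, vocab_size) -> (2, 3)
```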

turftopic/models/_hierarchical_clusters.py

Lines changed: 3 additions & 3 deletions

@@ -10,11 +10,11 @@
 
 from turftopic.base import ContextualModel
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.hierarchical import TopicNode
@@ -221,8 +221,8 @@ def _estimate_children_components(self) -> dict[int, np.ndarray]:
                 self.model.embeddings,
                 self.model.vocab_embeddings,
             )
-        elif self.model.feature_importance == "bayes":
-            components = bayes_rule(
+        elif self.model.feature_importance == "npmi":
+            components = npmi(
                 document_topic_matrix, self.model.doc_term_matrix
             )
         else:

turftopic/models/cluster.py

Lines changed: 7 additions & 9 deletions

@@ -23,11 +23,11 @@
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.encoders.multimodal import MultimodalEncoder
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.models._hierarchical_clusters import (
@@ -64,7 +64,7 @@
     "soft-c-tf-idf",
     "c-tf-idf",
     "centroid",
-    "bayes",
+    "npmi",
     "linear",
     "fighting-words",
 ]
@@ -157,7 +157,7 @@ class ClusteringTopicModel(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
         'fighting-words' calculates word importances based on the Fighting Words
@@ -293,7 +293,7 @@ def estimate_components(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
         'fighting-words' calculates word importances based on the Fighting Words
@@ -564,7 +564,7 @@ def estimate_temporal_components(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
@@ -605,10 +605,8 @@ def estimate_temporal_components(
             self.temporal_components_[i_timebin], _ = soft_ctf_idf(
                 t_doc_topic, t_dtm, return_idf=True
             )
-        elif feature_importance == "bayes":
-            self.temporal_components_[i_timebin] = bayes_rule(
-                t_doc_topic, t_dtm
-            )
+        elif feature_importance == "npmi":
+            self.temporal_components_[i_timebin] = npmi(t_doc_topic, t_dtm)
         elif feature_importance == "fighting-words":
             self.temporal_components_[i_timebin] = fighting_words(
                 t_doc_topic, t_dtm
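Since NPMI is also wired into the per-time-bin components above, here is a hedged sketch of using it in a dynamic workflow; `fit_dynamic`, `print_topics_over_time`, and the synthetic timestamps are assumptions based on turftopic's dynamic topic modeling API, not part of this diff:

```python
from datetime import datetime

from sklearn.datasets import fetch_20newsgroups
from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data
# Synthetic monthly timestamps, one per document (real data brings its own).
timestamps = [datetime(2020, (i % 12) + 1, 1) for i in range(len(corpus))]

# NPMI is used for both the static and the per-time-bin term importances.
model = ClusteringTopicModel(feature_importance="npmi")
model.fit_dynamic(corpus, timestamps=timestamps, bins=10)
model.print_topics_over_time()
```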
