
Commit 6848a8e

added tests

1 parent a4e74ab

4 files changed, +70 −4 lines changed


.gitignore

+1
@@ -4,6 +4,7 @@
 *.pyo
 *.cpp
 *.so
+*.swp
 build
 \#*\#
 .\#*

lda2vec/EmbedFactor.py renamed to lda2vec/embed_factor.py

+21 −4

@@ -19,16 +19,24 @@ class EmbedMixture(chainer.Chain):
     uninterpretable until you measure the words most similar to this topic
     vector.
 
+    :math:`e = \sum_{j=0}^{n\_topics} c_j \cdot \vec{T_j}`
+
+    This is usually paired with regularization on the weights :math:`c_j`.
+    If using a Dirichlet prior with low alpha, these weights will be sparse.
+
     Args:
         n_documents (int): Total number of documents
         n_topics (int): Number of topics per document
         n_dim (int): Number of dimensions per topic vector (should match word
             vector size)
 
     Attributes:
-        weights (~chainer.links.EmbedID): Unnormalized topic weights. To
-            normalize these weights, use `F.softmax(weights)`.
-        factors (~chainer.links.Parameter): Topic vector matrix.
+        weights (~chainer.links.EmbedID): Unnormalized topic weights
+            (:math:`c_j`). To normalize these weights, use
+            `F.softmax(weights)`.
+        factors (~chainer.links.Parameter): Topic vector matrix (:math:`T_j`).
+
+    .. seealso:: :func:`lda2vec.dirichlet_likelihood`
     """
 
     def __init__(self, n_documents, n_topics, n_dim):
@@ -52,7 +60,16 @@ def to_cpu(self):
         super(EmbedMixture, self).to_cpu()
 
     def __call__(self, doc_ids):
-        """
+        """ Given an array of document integer indices, returns a vector
+        for each document. The vector is composed of topic weights projected
+        onto topic vectors.
+
+        Args:
+            doc_ids (~chainer.Variable): One-dimensional batch vector of IDs
+
+        Returns:
+            ~chainer.Variable: Batch of two-dimensional embeddings for every
+                document.
         """
         # (batchsize, ) --> (batchsize, logweights)
         w = self.weights(doc_ids)
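The formula added to the docstring above is easy to sanity-check in isolation: a document vector is the softmax-normalized topic weights dotted onto the topic matrix. Below is a minimal NumPy sketch of that composition (an illustration of the math only, not code from this commit; the variable names are made up):

    import numpy as np

    n_topics, n_dim = 2, 5
    c = np.random.randn(n_topics).astype('float32')         # raw weights c_j
    T = np.random.randn(n_topics, n_dim).astype('float32')  # topic vectors T_j

    # softmax puts the raw weights on the simplex
    proportions = np.exp(c) / np.exp(c).sum()

    # e = sum_j c_j * T_j: a convex combination of topic vectors
    e = proportions.dot(T)
    assert e.shape == (n_dim,)

With a low-alpha Dirichlet prior on the proportions, most of the weight concentrates on a single topic vector, which is what keeps the document embeddings interpretable.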

tests/test_dirichlet_likelihood.py

+27

@@ -0,0 +1,27 @@
+import numpy as np
+from chainer import Variable
+
+from lda2vec import dirichlet_likelihood
+
+
+def test_concentration():
+    """ Test that alpha > 1.0 on a dense vector has a higher likelihood
+    than alpha < 1.0 on a dense vector, and test that a sparse vector
+    has the opposite character. """
+
+    dense = np.abs(np.random.randn(5, 10)).astype('float32')
+    dense /= dense.max(axis=0)
+    weights = Variable(dense)
+    dhl_likely = dirichlet_likelihood(weights, alpha=10.0)
+    dhl_unlikely = dirichlet_likelihood(weights, alpha=0.1)
+
+    assert dhl_likely.data > dhl_unlikely.data
+
+    sparse = np.abs(np.random.randn(5, 10)).astype('float32')
+    sparse[1:, :] = 0.0
+    sparse /= sparse.max(axis=0)
+    weights = Variable(sparse)
+    dhl_unlikely = dirichlet_likelihood(weights, alpha=10.0)
+    dhl_likely = dirichlet_likelihood(weights, alpha=0.1)
+
+    assert dhl_likely.data > dhl_unlikely.data
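For context on what this test checks: up to a normalizing constant, the log-density of a symmetric Dirichlet with concentration alpha over proportions p is sum_j (alpha − 1) log p_j, so alpha > 1 rewards dense proportion vectors and alpha < 1 rewards sparse ones. A minimal NumPy sketch of that term (an illustration of the math, not lda2vec's dirichlet_likelihood implementation):

    import numpy as np

    def dirichlet_logp(p, alpha):
        # symmetric Dirichlet log-density with the normalizing
        # constant dropped: sum_j (alpha - 1) * log p_j
        return ((alpha - 1.0) * np.log(p)).sum()

    dense = np.full(10, 0.1)                # uniform proportions
    sparse = np.array([0.91] + [0.01] * 9)  # nearly one-hot proportions

    # alpha > 1 favors dense vectors; alpha < 1 favors sparse ones
    assert dirichlet_logp(dense, alpha=10.0) > dirichlet_logp(sparse, alpha=10.0)
    assert dirichlet_logp(sparse, alpha=0.1) > dirichlet_logp(dense, alpha=0.1)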

tests/test_embed_factor.py

+21

@@ -0,0 +1,21 @@
+import numpy as np
+from chainer import Variable
+
+from lda2vec import EmbedMixture
+
+
+def softmax(v):
+    return np.exp(v) / np.sum(np.exp(v))
+
+
+def test_embed_mixture():
+    """ Manually test """
+    # Ten documents, two topics, five hidden dimensions
+    em = EmbedMixture(10, 2, 5)
+    doc_ids = Variable(np.arange(1, dtype='int32'))
+    doc_vector = em(doc_ids)
+    # weights -- (n_topics)
+    weights = softmax(em.weights.data[0, :])
+    # (n_hidden) = (n_topics) . (n_topics, n_hidden)
+    doc_vector_test = np.dot(weights, em.factors.data)
+    assert np.allclose(doc_vector.data, doc_vector_test)
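Assuming a conventional pytest layout (an assumption; this commit does not configure a test runner), both new test modules can be run from the repository root with:

    py.test tests/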

0 commit comments
