From 831cfcb8e5b718a4e40876698ba6ce523b90ebb2 Mon Sep 17 00:00:00 2001 From: Li Date: Mon, 21 May 2018 14:53:56 +0800 Subject: [PATCH 1/2] Fix logic error in topic_modelling_gensim notebook --- .../topic_modeling_Gensim-checkpoint.ipynb | 602 ++++++++++++++++++ topic_modeling_Gensim.ipynb | 6 +- 2 files changed, 605 insertions(+), 3 deletions(-) create mode 100644 .ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb diff --git a/.ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb b/.ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb new file mode 100644 index 0000000..44bf54b --- /dev/null +++ b/.ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb @@ -0,0 +1,602 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "spacy.load('en')\n", + "from spacy.lang.en import English\n", + "parser = English()\n", + "\n", + "def tokenize(text):\n", + " lda_tokens = []\n", + " tokens = parser(text)\n", + " for token in tokens:\n", + " if token.orth_.isspace():\n", + " continue\n", + " elif token.like_url:\n", + " lda_tokens.append('URL')\n", + " elif token.orth_.startswith('@'):\n", + " lda_tokens.append('SCREEN_NAME')\n", + " else:\n", + " lda_tokens.append(token.lower_)\n", + " return lda_tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /Users/sli/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "nltk.download('wordnet')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import wordnet as wn\n", + "def get_lemma(word):\n", + " lemma = wn.morphy(word)\n", + " if lemma is None:\n", + " return word\n", + " else:\n", + " return lemma\n", + " \n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "def get_lemma2(word):\n", + " return WordNetLemmatizer().lemmatize(word)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dogs dog dog\n", + "ran run ran\n", + "discouraged discourage discouraged\n" + ] + } + ], + "source": [ + "for w in ['dogs', 'ran', 'discouraged']:\n", + " print(w, get_lemma(w), get_lemma2(w))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /Users/sli/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "nltk.download('stopwords')\n", + "en_stop = set(nltk.corpus.stopwords.words('english'))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_text_for_lda(text):\n", + " tokens = tokenize(text)\n", + " tokens = [token for token in tokens if len(token) > 4]\n", + " tokens = [token for token in tokens if token not in en_stop]\n", + " tokens = [get_lemma(token) for token in tokens]\n", + " return tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"['rigorous', 'integration', 'piece', 'linear', 'continuous', 'system']\n", + "['shape', 'transformation', 'polyhedral', 'object']\n", + "['search', 'challenge', 'opportunity']\n", + "['programming']\n", + "['efficient', 'resource', 'allocation', 'flexible', 'channel', 'cooperation', 'ofdma', 'cognitive', 'radio', 'network']\n", + "['connectivity', 'large', 'scale', 'cognitive', 'radio', 'network']\n", + "['hardware', 'accelerate', 'shaders', 'using']\n", + "['domain', 'circuit', 'model', 'chemical', 'reaction']\n", + "['bin', 'algorithm', 'accurate', 'computer', 'aid', 'device', 'modeling']\n", + "['output', 'perturbation', 'query', 'relaxation']\n", + "['parallel', 'processor', 'architecture', 'graphics', 'arithmetic', 'operations']\n", + "['efficient', 'management', 'multiversion', 'document', 'object', 'reference']\n", + "['exact', 'regenerate', 'code', 'byzantine', 'fault', 'tolerance', 'distribute', 'storage']\n", + "['offset', 'cancellation', 'crossing', 'base', 'circuit']\n", + "['image', 'sensor', 'spike', 'pixel', 'retinal', 'stimulation']\n", + "['hardness', 'approximation', 'survivable', 'multi', 'level', 'problem']\n", + "['issue', 'distribute', 'database', 'management', 'system', 'technical', 'overview']\n", + "['uniform', 'recursive', 'subdivision', 'surface']\n", + "['stack', 'memory', 'design', 'instruction', 'folding', 'processor']\n", + "['framework', 'optimal', 'battery', 'management', 'wireless', 'node']\n", + "['selection', 'reduce', 'encoding', 'complexity', 'h.264/avc']\n", + "['speed', 'class', 'current', 'circuit']\n", + "['design', 'gallery', 'general', 'approach', 'setting', 'parameter', 'computer', 'graphics', 'animation']\n", + "['simple', 'realistic', 'generation']\n", + "['toward', 'practical', 'constraint', 'database']\n" + ] + } + ], + "source": [ + "import random\n", + "text_data = []\n", + "with open('dataset.csv') as f:\n", + " for line in f:\n", + " tokens = prepare_text_for_lda(line)\n", + " text_data.append(tokens)\n", + " if random.random() > .99:\n", + " print(tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim import corpora\n", + "dictionary = corpora.Dictionary(text_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [dictionary.doc2bow(text) for text in text_data]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "pickle.dump(corpus, open('corpus.pkl', 'wb'))\n", + "dictionary.save('dictionary.gensim')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try 5 topics" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "NUM_TOPICS = 5\n", + "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)\n", + "ldamodel.save('model5.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.034*\"processor\" + 0.019*\"database\" + 0.019*\"issue\" + 0.019*\"overview\"')\n", + "(1, '0.051*\"computer\" + 0.028*\"design\" + 0.028*\"graphics\" + 0.028*\"gallery\"')\n", + "(2, '0.050*\"management\" + 0.027*\"object\" + 0.027*\"circuit\" + 0.027*\"efficient\"')\n", + "(3, '0.019*\"cognitive\" + 0.019*\"radio\" + 0.019*\"network\" + 
0.019*\"distribute\"')\n", + "(4, '0.029*\"circuit\" + 0.029*\"system\" + 0.029*\"rigorous\" + 0.029*\"integration\"')\n" + ] + } + ], + "source": [ + "topics = ldamodel.print_topics(num_words=4)\n", + "for topic in topics:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(38, 1), (117, 1)]\n", + "[(0, 0.06669136), (1, 0.40170625), (2, 0.06670282), (3, 0.39819494), (4, 0.066704586)]\n" + ] + } + ], + "source": [ + "new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'\n", + "new_doc = prepare_text_for_lda(new_doc)\n", + "new_doc_bow = dictionary.doc2bow(new_doc)\n", + "print(new_doc_bow)\n", + "print(ldamodel.get_document_topics(new_doc_bow))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.029*\"processor\" + 0.016*\"management\" + 0.016*\"aid\" + 0.016*\"algorithm\"')\n", + "(1, '0.026*\"radio\" + 0.026*\"network\" + 0.026*\"cognitive\" + 0.026*\"efficient\"')\n", + "(2, '0.029*\"circuit\" + 0.029*\"distribute\" + 0.016*\"database\" + 0.016*\"management\"')\n" + ] + } + ], + "source": [ + "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)\n", + "ldamodel.save('model3.gensim')\n", + "topics = ldamodel.print_topics(num_words=4)\n", + "for topic in topics:\n", + " print(topic)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, '0.055*\"database\" + 0.055*\"system\" + 0.029*\"technical\" + 0.029*\"recursive\"')\n", + "(1, '0.038*\"distribute\" + 0.038*\"graphics\" + 0.038*\"regenerate\" + 0.038*\"exact\"')\n", + "(2, '0.055*\"management\" + 0.029*\"multiversion\" + 0.029*\"reference\" + 0.029*\"document\"')\n", + "(3, '0.046*\"circuit\" + 0.046*\"object\" + 0.046*\"generation\" + 0.046*\"transformation\"')\n", + "(4, '0.008*\"programming\" + 0.008*\"circuit\" + 0.008*\"network\" + 0.008*\"surface\"')\n", + "(5, '0.061*\"radio\" + 0.061*\"cognitive\" + 0.061*\"network\" + 0.061*\"connectivity\"')\n", + "(6, '0.085*\"programming\" + 0.008*\"circuit\" + 0.008*\"subdivision\" + 0.008*\"management\"')\n", + "(7, '0.041*\"circuit\" + 0.041*\"design\" + 0.041*\"processor\" + 0.041*\"instruction\"')\n", + "(8, '0.055*\"computer\" + 0.029*\"efficient\" + 0.029*\"channel\" + 0.029*\"cooperation\"')\n", + "(9, '0.061*\"stimulation\" + 0.061*\"sensor\" + 0.061*\"retinal\" + 0.061*\"pixel\"')\n" + ] + } + ], + "source": [ + "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)\n", + "ldamodel.save('model10.gensim')\n", + "topics = ldamodel.print_topics(num_words=4)\n", + "for topic in topics:\n", + " print(topic)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### pyLDAvis" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')\n", + "corpus = pickle.load(open('corpus.pkl', 'rb'))\n", + "lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sli/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: 
DeprecationWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", + " topic_term_dists = topic_term_dists.ix[topic_order]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyLDAvis.gensim\n", + "lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)\n", + "pyLDAvis.display(lda_display)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sli/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: DeprecationWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", + " topic_term_dists = topic_term_dists.ix[topic_order]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')\n", + "lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)\n", + "pyLDAvis.display(lda_display3)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sli/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: DeprecationWarning: \n", + ".ix is deprecated. Please use\n", + ".loc for label based indexing or\n", + ".iloc for positional indexing\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", + " topic_term_dists = topic_term_dists.ix[topic_order]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')\n", + "lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)\n", + "pyLDAvis.display(lda_display10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/topic_modeling_Gensim.ipynb b/topic_modeling_Gensim.ipynb index afeba57..44bf54b 100644 --- a/topic_modeling_Gensim.ipynb +++ b/topic_modeling_Gensim.ipynb @@ -170,9 +170,9 @@ "with open('dataset.csv') as f:\n", " for line in f:\n", " tokens = prepare_text_for_lda(line)\n", + " text_data.append(tokens)\n", " if random.random() > .99:\n", - " print(tokens)\n", - " text_data.append(tokens)" + " print(tokens)" ] }, { @@ -594,7 +594,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.5" } }, "nbformat": 4, From 29f75a5b948d0081eff0cf0b9fb95fe898bdbab0 Mon Sep 17 00:00:00 2001 From: Li Date: Mon, 21 May 2018 14:58:49 +0800 Subject: [PATCH 2/2] Delete unnecessary checkpoints files --- .../topic_modeling_Gensim-checkpoint.ipynb | 602 ------------------ 1 file changed, 602 deletions(-) delete mode 100644 .ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb diff --git a/.ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb b/.ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb deleted file mode 100644 index 44bf54b..0000000 --- a/.ipynb_checkpoints/topic_modeling_Gensim-checkpoint.ipynb +++ /dev/null @@ -1,602 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import spacy\n", - "spacy.load('en')\n", - "from spacy.lang.en import English\n", - "parser = English()\n", - "\n", - "def tokenize(text):\n", - " lda_tokens = []\n", - " tokens = parser(text)\n", - " for token in tokens:\n", - " if token.orth_.isspace():\n", - " continue\n", - " elif token.like_url:\n", - " lda_tokens.append('URL')\n", - " elif token.orth_.startswith('@'):\n", - " lda_tokens.append('SCREEN_NAME')\n", - " else:\n", - " lda_tokens.append(token.lower_)\n", - " return lda_tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package wordnet to /Users/sli/nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import nltk\n", - "nltk.download('wordnet')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.corpus import wordnet as wn\n", - "def get_lemma(word):\n", - " lemma = wn.morphy(word)\n", - " if lemma is None:\n", - " return word\n", - " else:\n", - " return lemma\n", - " \n", - "from nltk.stem.wordnet import WordNetLemmatizer\n", - "def get_lemma2(word):\n", - " return WordNetLemmatizer().lemmatize(word)" - ] - 
}, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dogs dog dog\n", - "ran run ran\n", - "discouraged discourage discouraged\n" - ] - } - ], - "source": [ - "for w in ['dogs', 'ran', 'discouraged']:\n", - " print(w, get_lemma(w), get_lemma2(w))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /Users/sli/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "nltk.download('stopwords')\n", - "en_stop = set(nltk.corpus.stopwords.words('english'))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_text_for_lda(text):\n", - " tokens = tokenize(text)\n", - " tokens = [token for token in tokens if len(token) > 4]\n", - " tokens = [token for token in tokens if token not in en_stop]\n", - " tokens = [get_lemma(token) for token in tokens]\n", - " return tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['rigorous', 'integration', 'piece', 'linear', 'continuous', 'system']\n", - "['shape', 'transformation', 'polyhedral', 'object']\n", - "['search', 'challenge', 'opportunity']\n", - "['programming']\n", - "['efficient', 'resource', 'allocation', 'flexible', 'channel', 'cooperation', 'ofdma', 'cognitive', 'radio', 'network']\n", - "['connectivity', 'large', 'scale', 'cognitive', 'radio', 'network']\n", - "['hardware', 'accelerate', 'shaders', 'using']\n", - "['domain', 'circuit', 'model', 'chemical', 'reaction']\n", - "['bin', 'algorithm', 'accurate', 'computer', 'aid', 'device', 'modeling']\n", - "['output', 'perturbation', 'query', 'relaxation']\n", - "['parallel', 'processor', 'architecture', 'graphics', 'arithmetic', 'operations']\n", - "['efficient', 'management', 'multiversion', 'document', 'object', 'reference']\n", - "['exact', 'regenerate', 'code', 'byzantine', 'fault', 'tolerance', 'distribute', 'storage']\n", - "['offset', 'cancellation', 'crossing', 'base', 'circuit']\n", - "['image', 'sensor', 'spike', 'pixel', 'retinal', 'stimulation']\n", - "['hardness', 'approximation', 'survivable', 'multi', 'level', 'problem']\n", - "['issue', 'distribute', 'database', 'management', 'system', 'technical', 'overview']\n", - "['uniform', 'recursive', 'subdivision', 'surface']\n", - "['stack', 'memory', 'design', 'instruction', 'folding', 'processor']\n", - "['framework', 'optimal', 'battery', 'management', 'wireless', 'node']\n", - "['selection', 'reduce', 'encoding', 'complexity', 'h.264/avc']\n", - "['speed', 'class', 'current', 'circuit']\n", - "['design', 'gallery', 'general', 'approach', 'setting', 'parameter', 'computer', 'graphics', 'animation']\n", - "['simple', 'realistic', 'generation']\n", - "['toward', 'practical', 'constraint', 'database']\n" - ] - } - ], - "source": [ - "import random\n", - "text_data = []\n", - "with open('dataset.csv') as f:\n", - " for line in f:\n", - " tokens = prepare_text_for_lda(line)\n", - " text_data.append(tokens)\n", - " if random.random() > .99:\n", - " print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from gensim import corpora\n", - "dictionary = corpora.Dictionary(text_data)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "corpus = [dictionary.doc2bow(text) for text in text_data]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "pickle.dump(corpus, open('corpus.pkl', 'wb'))\n", - "dictionary.save('dictionary.gensim')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Try 5 topics" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import gensim\n", - "NUM_TOPICS = 5\n", - "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)\n", - "ldamodel.save('model5.gensim')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(0, '0.034*\"processor\" + 0.019*\"database\" + 0.019*\"issue\" + 0.019*\"overview\"')\n", - "(1, '0.051*\"computer\" + 0.028*\"design\" + 0.028*\"graphics\" + 0.028*\"gallery\"')\n", - "(2, '0.050*\"management\" + 0.027*\"object\" + 0.027*\"circuit\" + 0.027*\"efficient\"')\n", - "(3, '0.019*\"cognitive\" + 0.019*\"radio\" + 0.019*\"network\" + 0.019*\"distribute\"')\n", - "(4, '0.029*\"circuit\" + 0.029*\"system\" + 0.029*\"rigorous\" + 0.029*\"integration\"')\n" - ] - } - ], - "source": [ - "topics = ldamodel.print_topics(num_words=4)\n", - "for topic in topics:\n", - " print(topic)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(38, 1), (117, 1)]\n", - "[(0, 0.06669136), (1, 0.40170625), (2, 0.06670282), (3, 0.39819494), (4, 0.066704586)]\n" - ] - } - ], - "source": [ - "new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'\n", - "new_doc = prepare_text_for_lda(new_doc)\n", - "new_doc_bow = dictionary.doc2bow(new_doc)\n", - "print(new_doc_bow)\n", - "print(ldamodel.get_document_topics(new_doc_bow))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(0, '0.029*\"processor\" + 0.016*\"management\" + 0.016*\"aid\" + 0.016*\"algorithm\"')\n", - "(1, '0.026*\"radio\" + 0.026*\"network\" + 0.026*\"cognitive\" + 0.026*\"efficient\"')\n", - "(2, '0.029*\"circuit\" + 0.029*\"distribute\" + 0.016*\"database\" + 0.016*\"management\"')\n" - ] - } - ], - "source": [ - "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)\n", - "ldamodel.save('model3.gensim')\n", - "topics = ldamodel.print_topics(num_words=4)\n", - "for topic in topics:\n", - " print(topic)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(0, '0.055*\"database\" + 0.055*\"system\" + 0.029*\"technical\" + 0.029*\"recursive\"')\n", - "(1, '0.038*\"distribute\" + 0.038*\"graphics\" + 0.038*\"regenerate\" + 0.038*\"exact\"')\n", - "(2, '0.055*\"management\" + 0.029*\"multiversion\" + 0.029*\"reference\" + 0.029*\"document\"')\n", - "(3, '0.046*\"circuit\" + 0.046*\"object\" + 0.046*\"generation\" + 0.046*\"transformation\"')\n", - "(4, '0.008*\"programming\" + 0.008*\"circuit\" + 0.008*\"network\" + 0.008*\"surface\"')\n", - "(5, '0.061*\"radio\" + 0.061*\"cognitive\" + 0.061*\"network\" + 0.061*\"connectivity\"')\n", - 
"(6, '0.085*\"programming\" + 0.008*\"circuit\" + 0.008*\"subdivision\" + 0.008*\"management\"')\n", - "(7, '0.041*\"circuit\" + 0.041*\"design\" + 0.041*\"processor\" + 0.041*\"instruction\"')\n", - "(8, '0.055*\"computer\" + 0.029*\"efficient\" + 0.029*\"channel\" + 0.029*\"cooperation\"')\n", - "(9, '0.061*\"stimulation\" + 0.061*\"sensor\" + 0.061*\"retinal\" + 0.061*\"pixel\"')\n" - ] - } - ], - "source": [ - "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)\n", - "ldamodel.save('model10.gensim')\n", - "topics = ldamodel.print_topics(num_words=4)\n", - "for topic in topics:\n", - " print(topic)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### pyLDAvis" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')\n", - "corpus = pickle.load(open('corpus.pkl', 'rb'))\n", - "lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sli/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: DeprecationWarning: \n", - ".ix is deprecated. Please use\n", - ".loc for label based indexing or\n", - ".iloc for positional indexing\n", - "\n", - "See the documentation here:\n", - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", - " topic_term_dists = topic_term_dists.ix[topic_order]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pyLDAvis.gensim\n", - "lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)\n", - "pyLDAvis.display(lda_display)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sli/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: DeprecationWarning: \n", - ".ix is deprecated. Please use\n", - ".loc for label based indexing or\n", - ".iloc for positional indexing\n", - "\n", - "See the documentation here:\n", - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", - " topic_term_dists = topic_term_dists.ix[topic_order]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')\n", - "lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)\n", - "pyLDAvis.display(lda_display3)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sli/anaconda3/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: DeprecationWarning: \n", - ".ix is deprecated. Please use\n", - ".loc for label based indexing or\n", - ".iloc for positional indexing\n", - "\n", - "See the documentation here:\n", - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n", - " topic_term_dists = topic_term_dists.ix[topic_order]\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')\n", - "lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)\n", - "pyLDAvis.display(lda_display10)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}