Skip to content

Commit 79b38df

Browse files
authored
Add LDA file
1 parent a4e07e0 commit 79b38df

File tree

1 file changed

+127
-0
lines changed

1 file changed

+127
-0
lines changed

LDA.ipynb

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 37,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from nltk.tokenize import RegexpTokenizer\n",
10+
"from stop_words import get_stop_words\n",
11+
"from nltk.stem.porter import PorterStemmer\n",
12+
"from gensim import corpora, models\n",
13+
"import gensim"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": 44,
19+
"metadata": {
20+
"collapsed": true
21+
},
22+
"outputs": [],
23+
"source": [
24+
"tokenizer = RegexpTokenizer(r'\\w+')\n",
25+
"\n",
26+
"# create English stop words list\n",
27+
"en_stop = get_stop_words('en')\n",
28+
"\n",
29+
"# Create p_stemmer of class PorterStemmer\n",
30+
"p_stemmer = PorterStemmer()\n",
31+
" \n",
32+
"# create sample documents\n",
33+
"doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n",
34+
"doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n",
35+
"doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n",
36+
"doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n",
37+
"doc_e = \"Health professionals say that brocolli is good for your health.\" \n",
38+
"\n",
39+
"# compile sample documents into a list\n",
40+
"doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n",
41+
"\n",
42+
"# list for tokenized documents in loop\n",
43+
"texts = []\n",
44+
"\n",
45+
"# loop through document list\n",
46+
"for i in doc_set:\n",
47+
" \n",
48+
" # clean and tokenize document string\n",
49+
" raw = i.lower()\n",
50+
" tokens = tokenizer.tokenize(raw)\n",
51+
"\n",
52+
" # remove stop words from tokens\n",
53+
" stopped_tokens = [i for i in tokens if not i in en_stop]\n",
54+
" \n",
55+
" # stem tokens\n",
56+
" stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n",
57+
" \n",
58+
" # add tokens to list\n",
59+
" texts.append(stemmed_tokens)\n",
60+
"\n",
61+
"# turn our tokenized documents into an id <-> term dictionary\n",
62+
"dictionary = corpora.Dictionary(texts)\n",
63+
" \n",
64+
"# convert tokenized documents into a document-term matrix\n",
65+
"corpus = [dictionary.doc2bow(text) for text in texts]\n",
66+
"\n",
67+
"# generate LDA model\n",
68+
"ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": 45,
74+
"metadata": {},
75+
"outputs": [
76+
{
77+
"name": "stdout",
78+
"output_type": "stream",
79+
"text": [
80+
"[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n"
81+
]
82+
}
83+
],
84+
"source": [
85+
"print(ldamodel.print_topics(num_topics=2, num_words=4))"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 47,
91+
"metadata": {},
92+
"outputs": [
93+
{
94+
"name": "stdout",
95+
"output_type": "stream",
96+
"text": [
97+
"[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n"
98+
]
99+
}
100+
],
101+
"source": [
102+
"print(ldamodel.print_topics(num_topics=3, num_words=3))"
103+
]
104+
}
105+
],
106+
"metadata": {
107+
"kernelspec": {
108+
"display_name": "Python 3",
109+
"language": "python",
110+
"name": "python3"
111+
},
112+
"language_info": {
113+
"codemirror_mode": {
114+
"name": "ipython",
115+
"version": 3
116+
},
117+
"file_extension": ".py",
118+
"mimetype": "text/x-python",
119+
"name": "python",
120+
"nbconvert_exporter": "python",
121+
"pygments_lexer": "ipython3",
122+
"version": "3.6.1"
123+
}
124+
},
125+
"nbformat": 4,
126+
"nbformat_minor": 2
127+
}

0 commit comments

Comments
 (0)