Skip to content

Commit df5dbbe

Browse files
committed
change default expansion corpus
1 parent c7348a5 commit df5dbbe

File tree

2 files changed

+15
-16
lines changed

2 files changed

+15
-16
lines changed

stream_topic/models/CEDC.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,7 @@ def fit(
191191
only_nouns: bool = False,
192192
clean: bool = False,
193193
clean_threshold: float = 0.85,
194-
expansion_corpus: str = "octis",
194+
expansion_corpus: str = "brown",
195195
n_words: int = 20,
196196
):
197197
"""
@@ -210,7 +210,7 @@ def fit(
210210
clean_threshold : float, optional
211211
Threshold for cleaning topics based on similarity (default is 0.85).
212212
expansion_corpus : str, optional
213-
Corpus for expanding topics (default is 'octis').
213+
Corpus for expanding topics (default is 'brown').
214214
n_words : int, optional
215215
Number of top words to include in each topic (default is 20).
216216

stream_topic/preprocessor/topic_extraction.py

+13-14
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from nltk.corpus import brown as nltk_words
99
from nltk.corpus import words as eng_dict
1010
from numpy.linalg import norm
11-
from octis.dataset.dataset import Dataset as OCDataset
11+
from ..utils.dataset import TMDataset
1212

1313
from ._embedder import BaseEmbedder
1414

@@ -37,7 +37,7 @@ def __init__(
3737
self.embedder = BaseEmbedder(embedding_model)
3838
self.n_topics = n_topics
3939

40-
def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=True):
40+
def _noun_extractor_haystack(self, embeddings, n, corpus="brown", only_nouns=True):
4141
"""
4242
Extracts the topic's most probable words, which are the words nearest to the topic's centroid.
4343
We extract all nouns from the corpus and the brown corpus. Afterwards we compute the cosine similarity between every word and every centroid.
@@ -51,6 +51,7 @@ def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=Tru
5151
Args:
5252
embeddings (_type_): _document embeddings to compute centroid of the topic
5353
n (_type_): n_top number of words per topic
54+
corpus (str, optional): corpus to be used for word extraction. Defaults to "brown". One of "brown", "stream", "words".
5455
5556
Returns:
5657
dict: extracted topics
@@ -67,21 +68,21 @@ def is_noun(pos):
6768
if corpus == "brown":
6869
word_list = nltk_words.words()
6970
word_list = [word.lower().strip() for word in word_list]
70-
word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word)
71-
for word in word_list]
71+
word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list]
7272
elif corpus == "words":
7373
word_list = eng_dict.words()
7474
word_list = [word.lower().strip() for word in word_list]
75-
word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word)
76-
for word in word_list]
77-
elif corpus == "octis":
78-
data = OCDataset()
79-
data.fetch_dataset("20NewsGroup")
75+
word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list]
76+
elif corpus == "stream":
77+
data = TMDataset()
78+
data.fetch_dataset("20NewsGroups")
8079
word_list = data.get_vocabulary()
81-
data.fetch_dataset("M10")
80+
data.fetch_dataset("Spotify")
8281
word_list += data.get_vocabulary()
8382
data.fetch_dataset("BBC_News")
8483
word_list += data.get_vocabulary()
84+
data.fetch_dataset("Poliblogs")
85+
word_list += data.get_vocabulary()
8586

8687
# include reuters etc datasets
8788
# data.load_custom_dataset_from_folder(DATADIR + "/GN")
@@ -90,16 +91,14 @@ def is_noun(pos):
9091
word_list += self.dataset.get_vocabulary()
9192

9293
word_list = [word.lower().strip() for word in word_list]
93-
word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word)
94-
for word in word_list]
94+
word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list]
9595
else:
9696
raise ValueError(
9797
"There are no words to be extracted for the Topics: Please specify a corpus"
9898
)
9999

100100
if only_nouns:
101-
word_list = [word for (word, pos) in pos_tag(
102-
word_list) if is_noun(pos)]
101+
word_list = [word for (word, pos) in pos_tag(word_list) if is_noun(pos)]
103102
else:
104103
word_list = [word for (word, pos) in pos_tag(word_list)]
105104

0 commit comments

Comments
 (0)