from nltk.corpus import brown as nltk_words
from nltk.corpus import words as eng_dict
from numpy.linalg import norm
- from octis.dataset.dataset import Dataset as OCDataset
+ from ..utils.dataset import TMDataset

from ._embedder import BaseEmbedder

@@ -37,7 +37,7 @@ def __init__(
        self.embedder = BaseEmbedder(embedding_model)
        self.n_topics = n_topics

-     def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=True):
+     def _noun_extractor_haystack(self, embeddings, n, corpus="brown", only_nouns=True):
        """
        Extracts the topics' most probable words, which are the words nearest to each topic's centroid.
        We extract all nouns from the corpus and the Brown corpus. Afterwards, we compute the cosine similarity between every word and every centroid.
@@ -51,6 +51,7 @@ def _noun_extractor_haystack(self, embeddings, n, corpus="octis", only_nouns=Tru
        Args:
            embeddings (_type_): document embeddings to compute the centroid of the topic
            n (_type_): n_top number of words per topic
+             corpus (str, optional): corpus to be used for word extraction. Defaults to "brown". One of "brown", "stream", "words".

        Returns:
            dict: extracted topics
@@ -67,21 +68,21 @@ def is_noun(pos):
        if corpus == "brown":
            word_list = nltk_words.words()
            word_list = [word.lower().strip() for word in word_list]
-             word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word)
-                          for word in word_list]
+             word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list]
        elif corpus == "words":
            word_list = eng_dict.words()
            word_list = [word.lower().strip() for word in word_list]
-             word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word)
-                          for word in word_list]
-         elif corpus == "octis":
-             data = OCDataset()
-             data.fetch_dataset("20NewsGroup")
+             word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list]
+         elif corpus == "stream":
+             data = TMDataset()
+             data.fetch_dataset("20NewsGroups")
            word_list = data.get_vocabulary()
-             data.fetch_dataset("M10")
+             data.fetch_dataset("Spotify")
            word_list += data.get_vocabulary()
            data.fetch_dataset("BBC_News")
            word_list += data.get_vocabulary()
+             data.fetch_dataset("Poliblogs")
+             word_list += data.get_vocabulary()

            # include reuters etc datasets
            # data.load_custom_dataset_from_folder(DATADIR + "/GN")
@@ -90,16 +91,14 @@ def is_noun(pos):
            word_list += self.dataset.get_vocabulary()

            word_list = [word.lower().strip() for word in word_list]
-             word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word)
-                          for word in word_list]
+             word_list = [re.sub(r"[^a-zA-Z0-9]+\s*", "", word) for word in word_list]
        else:
            raise ValueError(
                "There are no words to be extracted for the Topics: Please specify a corpus"
            )

        if only_nouns:
-             word_list = [word for (word, pos) in pos_tag(
-                 word_list) if is_noun(pos)]
+             word_list = [word for (word, pos) in pos_tag(word_list) if is_noun(pos)]
        else:
            word_list = [word for (word, pos) in pos_tag(word_list)]
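For reference, a minimal sketch of the centroid-similarity step the docstring describes: embed the candidate words, compute the cosine similarity between every word and every topic centroid, and keep the n nearest words per topic. The embed_words helper is a hypothetical stand-in for whatever BaseEmbedder exposes; only the numpy math below is asserted.

import numpy as np
from numpy.linalg import norm

def top_words_per_centroid(word_vecs, words, centroids, n):
    # word_vecs: (V, d) embeddings of the candidate words.
    # centroids: (K, d) topic centroids from the document embeddings.
    # Cosine similarity between every word and every centroid, as in the docstring.
    sims = word_vecs @ centroids.T
    sims /= np.outer(norm(word_vecs, axis=1), norm(centroids, axis=1)) + 1e-12
    # For each topic, take the indices of the n most similar words.
    return {
        k: [words[i] for i in np.argsort(sims[:, k])[::-1][:n]]
        for k in range(centroids.shape[0])
    }

# word_vecs = embed_words(word_list)  # hypothetical: embed word_list via self.embedder
# topics = top_words_per_centroid(word_vecs, word_list, centroids, n)

A dict keyed by topic index matches the "dict: extracted topics" return value the docstring promises; the epsilon guards against zero-norm vectors.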