-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfcts.py
27 lines (20 loc) · 785 Bytes
/
fcts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance, created once at import time and
# reused by stem() for every word.
stemmer = PorterStemmer()
# split a sentence into an array of tokens (each token is a word, punctuation character, or number)
def tokenize(sentence):
    """Split *sentence* into tokens with NLTK's word tokenizer.

    Args:
        sentence: The text to split.

    Returns:
        Whatever ``nltk.word_tokenize`` produces for *sentence*.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
# stemming is finding the root form of the word
def stem(word):
    """Return the stem of *word* as computed by the module-level stemmer.

    The word is lower-cased before being handed to the stemmer.

    Args:
        word: The token to reduce; must provide a ``lower()`` method.

    Returns:
        The result of ``stemmer.stem`` applied to the lower-cased word.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
# return the bag-of-words array: 1 for each known word that exists in the sentence, 0 otherwise
def bag_of_words(tokenized_sentence, words):
    """Build a bag-of-words vector for a tokenized sentence.

    Every token is stemmed with ``stem`` and the vocabulary entries in
    *words* are compared against those stems, so vocabulary entries only
    match when they are themselves in stemmed form.

    Args:
        tokenized_sentence: Iterable of tokens from the sentence.
        words: Sequence of known vocabulary entries (assumed to be
            hashable, e.g. strings); its order fixes the vector layout.

    Returns:
        A 1-D ``numpy.float32`` array of length ``len(words)`` holding 1
        at each position whose vocabulary entry occurs among the stemmed
        tokens, and 0 elsewhere.
    """
    # Collect the stems in a set so each vocabulary lookup below is a hash
    # probe rather than a linear scan of the whole sentence.
    sentence_stems = {stem(token) for token in tokenized_sentence}

    # Start from all zeros and flag only the vocabulary entries that appear.
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, known_word in enumerate(words):
        if known_word in sentence_stems:
            bag[idx] = 1
    return bag