-
Notifications
You must be signed in to change notification settings - Fork 0
/
mapper.py
62 lines (54 loc) · 1.97 KB
/
mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import logging
class Mapper:
    """Map-phase helpers producing word counts and per-file word counts.

    Text is lower-cased and tokenised on whitespace; only tokens that are
    entirely alphanumeric (``str.isalnum``) are kept.  A token containing
    any other character (for example ``"cat!"``) is dropped as a whole,
    not stripped of its punctuation.
    """

    # Class-level placeholder; no method in this class reads or writes it.
    text = None

    def __init__(self):
        """Create a mapper with empty result dictionaries."""
        self.word_count_dict = {}
        self.inverted_index_dict = {}

    def word_count_mapper(self, text):
        """Add the word frequencies of *text* to the running word count.

        Counts accumulate in ``self.word_count_dict`` across calls on the
        same instance; the dictionary is not reset here.

        Args:
            text: Input to count; it is converted with ``str()`` and
                lower-cased before tokenising.

        Returns:
            ``self.word_count_dict`` mapping each kept word to its total
            count so far.

        Raises:
            Exception: Any error raised while processing is logged and
                re-raised unchanged.
        """
        try:
            logging.info("Word Count mapper")
            cleaned = self.clean_text(str(text).lower())
            counts = self.word_count_dict
            for word in cleaned.split():
                counts[word] = counts.get(word, 0) + 1
            return counts
        except Exception as e:
            logging.error(str(e))
            raise

    def inverted_index_mapper(self, file_content_dict):
        """Count occurrences of each word per file.

        ``self.inverted_index_dict`` is reset at the start of every call,
        so the result reflects only *file_content_dict*.

        Args:
            file_content_dict: Mapping of filename to file text.  Each
                text is converted with ``str()`` and lower-cased.

        Returns:
            ``self.inverted_index_dict`` mapping ``"<word>-<filename>"``
            to the number of times the word occurs in that file.

        Raises:
            Exception: Any error raised while processing is logged and
                re-raised unchanged.
        """
        try:
            logging.info("Inverted Index mapper")
            self.inverted_index_dict = {}
            index = self.inverted_index_dict
            for filename, content in file_content_dict.items():
                cleaned = self.clean_text(str(content).lower())
                # split() with no argument never yields empty tokens, so
                # an empty or fully-rejected text adds no "-<filename>" key.
                for word in cleaned.split():
                    key = word + "-" + filename
                    index[key] = index.get(key, 0) + 1
            return index
        except Exception as e:
            logging.error(str(e))
            raise

    @staticmethod
    def clean_text(text):
        """Return the alphanumeric tokens of *text* joined by single spaces.

        Tokens are separated on any run of whitespace, so words next to
        newlines or tabs are kept.  The result has no leading or trailing
        space, and is ``""`` when no token qualifies.

        Args:
            text: String to clean.

        Returns:
            A string of the kept tokens separated by one space each.

        Raises:
            Exception: Any error raised while processing (for example
                *text* lacking ``split``) is logged and re-raised.
        """
        try:
            logging.info("Cleaning Mapper Text")
            return " ".join(word for word in text.split() if word.isalnum())
        except Exception as e:
            logging.error(str(e))
            raise