
Commit

Separated the all_solution() method from words.py by creating words_offline.py

Purpose: with this commit, words.py returns a solution that requires
         Moby's thesaurus, gensim's glove-wiki-gigaword-100, and nltk's WordNet,
         whereas words_offline.py returns a solution that requires all-clues.bz2.
pncnmnp committed Dec 20, 2019
1 parent d8b8b5b commit 52176b6
Showing 3 changed files with 105 additions and 83 deletions.
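After the split, both modules expose the same fetch_words(clues) entry point; only their resources differ. A minimal driver sketch, mirroring the __main__ blocks of the two files below (nothing here goes beyond what the diff itself shows):

# Build the {clue: answer_length} dict from the shared schema.
from schema import CROSSWORD_GRID

clues = dict()
for clue in CROSSWORD_GRID:
    clues[clue] = CROSSWORD_GRID[clue]["length"]

# Online solver: needs Moby's thesaurus, gensim's glove-wiki-gigaword-100,
# and nltk's WordNet.
from words import Words
Words().fetch_words(clues)

# Offline solver: driven by the all-clues.bz2 wordlist instead.
from words_offline import Words_Offline
Words_Offline().fetch_words(clues)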
2 changes: 1 addition & 1 deletion clues.json
@@ -1 +1 @@
"{'__ of bad news': ['sitson', 'edible', 'better', 'imsory', 'yerout', 'ashame', 'lesser', 'goodor', 'severe', 'isgood', 'oopsie', 'noroom', 'rotten', 'pileup', 'nohits', 'odious', 'recall', 'itssad', 'bearer', 'rancid', 'decent', 'nosale', 'whoops', 'rialto', 'grimly', 'delays', 'illuse'], 'Posture problem': ['issue', 'tough', 'allok', 'stoop', 'asnap', 'poser', 'whats', 'minds', 'sorry', 'seems', 'cando', 'itsok'], 'Loads': ['rearms', 'plenty', 'oodles', 'cargos', 'oceans', 'adored', 'roadie', 'adores', 'onuses', 'washes', 'scores'], 'Laundry appliance': ['drier', 'dryer'], 'Lectured': ['spoke', 'pupil'], 'One who weeps': ['group', 'trust', 'sided', 'twoto', 'arent', 'oweme', 'these', 'cuber', 'digit', 'ahead', 'asoul', 'alike', 'cryer', 'lucky', 'akind', 'their', 'alone', 'longe', 'acter', 'rewed', 'equal', 'atime', 'ortwo', 'sixof', 'bogey', 'icare', 'alien', 'crier', 'wedto', 'shirt', 'again', 'admit', 'whine', 'nonot', 'oneto', 'which', 'oneby', 'ahalf', 'units', 'title', 'tryit', 'fiber', 'model', 'iwant', 'vowel', 'unite', 'among', 'idiom', 'riser', 'cries', 'along', 'agree', 'piece', 'grade', 'excon', 'groan', 'loser', 'tento', 'puzle', 'those', 'orthe', 'every', 'motto', 'owner', 'satan', 'sroot'], 'Grassy clump': ['sod', 'wad'], 'Pie chart portion': ['sector'], '\"Scary Movie,\" e.g.': ['teaser', 'scream', 'scarer', 'promos', 'rental', 'weeper', 'sequel', 'parody'], \"Maryland's state bird\": ['grouse', 'thrush', 'oriole'], 'Something worth saving': ['usable', 'assets', 'keeper'], '\"To __ is human\"': ['ist', 'aah', 'eve', 'ity', 'sin', 'cpa', 'err', 'men', 'ism', 'all', 'art', 'ape', 'oid', 'soc', 'spy', 'lap', 'man', 'jon', 'arm']}"
"{'__ of bad news': ['ashame', 'bearer', 'better', 'decent', 'delays', 'edible', 'goodor', 'grimly', 'illuse', 'imsory', 'isgood', 'itssad', 'lesser', 'nohits', 'noroom', 'nosale', 'odious', 'oopsie', 'pileup', 'rancid', 'recall', 'rialto', 'rotten', 'severe', 'sitson', 'whoops', 'yerout'], 'Posture problem': ['allok', 'asnap', 'cando', 'issue', 'itsok', 'minds', 'poser', 'seems', 'sorry', 'stoop', 'tough', 'whats'], 'Loads': ['adored', 'adores', 'cargos', 'oceans', 'onuses', 'oodles', 'plenty', 'rearms', 'roadie', 'scores', 'washes'], 'Laundry appliance': ['drier', 'dryer'], 'Lectured': ['pupil', 'spoke'], 'One who weeps': ['acter', 'admit', 'again', 'agree', 'ahalf', 'ahead', 'akind', 'alien', 'alike', 'alone', 'along', 'among', 'arent', 'asoul', 'atime', 'bogey', 'crier', 'cries', 'cryer', 'cuber', 'digit', 'equal', 'every', 'excon', 'fiber', 'grade', 'groan', 'group', 'icare', 'idiom', 'iwant', 'longe', 'loser', 'lucky', 'model', 'motto', 'nonot', 'oneby', 'oneto', 'orthe', 'ortwo', 'oweme', 'owner', 'piece', 'puzle', 'rewed', 'riser', 'satan', 'shirt', 'sided', 'sixof', 'sroot', 'tento', 'their', 'these', 'those', 'title', 'trust', 'tryit', 'twoto', 'unite', 'units', 'vowel', 'wedto', 'which', 'whine'], 'Grassy clump': ['sod', 'wad'], 'Pie chart portion': ['sector'], '\"Scary Movie,\" e.g.': ['parody', 'promos', 'rental', 'scarer', 'scream', 'sequel', 'teaser', 'weeper'], \"Maryland's state bird\": ['grouse', 'oriole', 'thrush'], 'Something worth saving': ['assets', 'keeper', 'usable'], '\"To __ is human\"': ['aah', 'all', 'ape', 'arm', 'art', 'cpa', 'err', 'eve', 'ism', 'ist', 'ity', 'jon', 'lap', 'man', 'men', 'oid', 'sin', 'soc', 'spy']}"
109 changes: 27 additions & 82 deletions words.py
@@ -2,16 +2,14 @@
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from ast import literal_eval
from collections import Counter
import gensim.downloader as api
from file_path import *
from schema import CROSSWORD_GRID
import pickle
import string
import requests
import inflect # Library used to check whether a word is singular or plural
import json
import math
import re

"""
TODO: >> Plural detection and conversion [done]
@@ -158,57 +156,6 @@ def sentence_solution(self, sentence_clues, clues):

return clue_mapping

def all_solution(self, clues):
stop = stopwords.words('english') + [""]

with open(ALL_CLUES, encoding="latin-1") as fp:
dict_guesses = fp.readlines()

clue_mapping = dict()
all_lengths = []
for clue in clues:
clue_mapping[clue] = list()
if clues[clue] not in all_lengths:
all_lengths.append(clues[clue])

clue_statements = list(clues.keys())
clue_vecs = dict()
for clue in clue_statements:
clue_vecs[clue] = [word for word in [word.strip(string.punctuation) for word in clue.lower().split()] if word not in stop]

print(">>> STARTING ALL CLUES FETCH (V.1).....")
for guess in dict_guesses:
if len(guess.split()[0]) not in all_lengths:
continue

guess_statement = " ".join(guess.split()[4:])
guess_vec = Counter([word for word in [word.strip(string.punctuation) for word in guess_statement.lower().split()] if word not in stop])

for clue in clue_statements:
if len(guess.split()[0]) == clues[clue]:
clue_vec = Counter(clue_vecs[clue])

# https://stackoverflow.com/questions/15173225/calculate-cosine-similarity-given-2-sentence-strings
intersection = set(guess_vec.keys()) & set(clue_vec.keys())
numerator = sum([guess_vec[x] * clue_vec[x] for x in intersection])

sum1 = sum([guess_vec[x]**2 for x in guess_vec.keys()])
sum2 = sum([clue_vec[x]**2 for x in clue_vec.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)

if not denominator:
sim = 0.0
else:
sim = float(numerator) / denominator

if sim > 0.65:
clue_mapping[clue] += [guess.split()[0].lower()]

for clue in clues:
clue_mapping[clue] = list(set(clue_mapping[clue]))

return clue_mapping

def one_word_solution(self, one_word_clues, clues):
fp = open(MOBY_PATH)
moby_lines = fp.readlines()
@@ -289,11 +236,9 @@ def store_words(self, one_word_solved, one_word_clues, sentence_solved, wikipedia_solved):
for key in list(sentence_solved.keys()):
try:
clues[key] += [word[0] for word in sentence_solved[key]]
clues[key] += wikipedia_solved[key]
except:
clues[key] = list()
clues[key] += [word[0] for word in sentence_solved[key]]
clues[key] += wikipedia_solved[key]

for key in list(wikipedia_solved.keys()):
try:
@@ -320,31 +265,31 @@ def fetch_words(self, clues):
Param: clues - dict
{clue_1: word_len_1, clue_2: word_len_2}
"""
# all_clues = list(clues.keys())
# one_word_clues = [(clue.lower(),clue) for clue in all_clues if len(clue.split(" ")) == 1]

# # converting words such as extra-large into large
# one_word_clues += [(clue.split("-")[-1].lower(),clue) for clue in all_clues
# if ("-" in clue) and (len(clue.split("-"))) == 2]
# one_word_solved = self.one_word_solution_alternate([clue[0] for clue in one_word_clues], clues)

# sentence_clues = list(set(all_clues).difference(set(one_word_clues)))
# sentence_solved = self.sentence_solution(sentence_clues, clues)

# wikipedia_clues = list()
# # Print top N results
# N = 40
# for clue in sentence_solved:
# sentence_solved[clue] = sentence_solved[clue][:N]

# wikipedia_solved = self.wikipedia_solution(sentence_clues, clues)
all_solved = self.all_solution(clues)
print(all_solved)
print(">>> STORED CLUES.....")
with open(CLUES_PATH, "w") as fp:
json.dump(str(all_solved), fp)
all_clues = list(clues.keys())
one_word_clues = [(clue.lower(),clue) for clue in all_clues if len(clue.split(" ")) == 1]

# converting words such as extra-large into large
one_word_clues += [(clue.split("-")[-1].lower(),clue) for clue in all_clues
if ("-" in clue) and (len(clue.split("-"))) == 2]
one_word_solved = self.one_word_solution_alternate([clue[0] for clue in one_word_clues], clues)

sentence_clues = list(set(all_clues).difference(set(one_word_clues)))
sentence_solved = self.sentence_solution(sentence_clues, clues)

wikipedia_clues = list()
# Print top N results
N = 40
for clue in sentence_solved:
sentence_solved[clue] = sentence_solved[clue][:N]

wikipedia_solved = self.wikipedia_solution(sentence_clues, clues)

self.store_words(one_word_solved, one_word_clues, sentence_solved, wikipedia_solved)

if __name__ == '__main__':
# THE WORDS INSERTED SHOULD HAVE THEIR STARTING LETTER CAPITALIZED
# Words().fetch_words({"A type of cheese": 4, "Indian Grandmaster": 5, "A small european singing bird": 5, "A plant of the cabbage species": 8, "Director of Raging Bull": 8, "Fireplace": 7, "A popular game character created by Shigeru Miyamoto": 5, "Author who created Sherlock Holmes": 5, "The science of life": 7, "Used for baking or roasting": 4})
Words().fetch_words({"__ of bad news": 6, "Posture problem": 5, "Loads": 6, "Laundry appliance": 5, "Lectured": 5, "One who weeps": 5, "Grassy clump": 3, "Pie chart portion": 6, "\"Scary Movie,\" e.g.": 6, "Maryland's state bird": 6, "Something worth saving": 6, "\"To __ is human\"": 3})
grid = CROSSWORD_GRID
clues = dict()
for clue in CROSSWORD_GRID:
clues[clue] = CROSSWORD_GRID[clue]["length"]

Words().fetch_words(clues)
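The new words_offline.py below is driven entirely by the ALL_CLUES file. Judging from how all_solution() slices each record (guess.split()[0] for the answer, guess.split()[4:] for the clue text), every line appears to hold a whitespace-separated answer, three metadata fields, and then the clue statement. A hedged sketch of that per-line parse; the sample line and field meanings are inferred, not taken from the actual file:

# Illustrative record only -- layout inferred from all_solution() below.
line = "dryer 1 23 1999 Laundry appliance"
fields = line.split()
answer = fields[0]                 # candidate answer word
clue_text = " ".join(fields[4:])   # clue statement used for matching
print(len(answer), clue_text)      # answer length drives the early filter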
77 changes: 77 additions & 0 deletions words_offline.py
@@ -0,0 +1,77 @@
from nltk.corpus import stopwords
from collections import Counter
from schema import CROSSWORD_GRID
from file_path import *
import string
import math
import re
import json

class Words_Offline():
def __init__(self):
pass

def all_solution(self, clues):
stop = stopwords.words('english') + [""]

with open(ALL_CLUES, encoding="latin-1") as fp:
dict_guesses = fp.readlines()

clue_mapping = dict()
all_lengths = []
for clue in clues:
clue_mapping[clue] = list()
if clues[clue] not in all_lengths:
all_lengths.append(clues[clue])

clue_statements = list(clues.keys())
clue_vecs = dict()
for clue in clue_statements:
clue_vecs[clue] = [word for word in [word.strip(string.punctuation) for word in clue.lower().split()] if word not in stop]

print(">>> STARTING ALL CLUES FETCH (V.1).....")
for guess in dict_guesses:
if len(guess.split()[0]) not in all_lengths:
continue

guess_statement = " ".join(guess.split()[4:])
guess_vec = Counter([word for word in [word.strip(string.punctuation) for word in guess_statement.lower().split()] if word not in stop])

for clue in clue_statements:
if len(guess.split()[0]) == clues[clue]:
clue_vec = Counter(clue_vecs[clue])

# https://stackoverflow.com/questions/15173225/calculate-cosine-similarity-given-2-sentence-strings
intersection = set(guess_vec.keys()) & set(clue_vec.keys())
numerator = sum([guess_vec[x] * clue_vec[x] for x in intersection])

sum1 = sum([guess_vec[x]**2 for x in guess_vec.keys()])
sum2 = sum([clue_vec[x]**2 for x in clue_vec.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)

if not denominator:
sim = 0.0
else:
sim = float(numerator) / denominator

if sim > 0.65:
clue_mapping[clue] += [guess.split()[0].lower()]

for clue in clues:
clue_mapping[clue] = list(set(clue_mapping[clue]))

return clue_mapping

def fetch_words(self, clues):
all_solved = self.all_solution(clues)
print(">>> STORED CLUES.....")
with open(CLUES_PATH, "w") as fp:
json.dump(str(all_solved), fp)

if __name__ == '__main__':
grid = CROSSWORD_GRID
clues = dict()
for clue in CROSSWORD_GRID:
clues[clue] = CROSSWORD_GRID[clue]["length"]

Words_Offline().fetch_words(clues)
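For reference, the scoring inside all_solution() is plain bag-of-words cosine similarity with a 0.65 cutoff. A self-contained sketch of that formula on a toy pair (the helper name is mine, not from the repo):

from collections import Counter
import math

def cosine_sim(a_words, b_words):
    # Dot product of term counts over the product of Euclidean norms,
    # the same computation done inline in all_solution().
    a, b = Counter(a_words), Counter(b_words)
    numerator = sum(a[x] * b[x] for x in set(a) & set(b))
    denominator = math.sqrt(sum(v ** 2 for v in a.values())) * \
                  math.sqrt(sum(v ** 2 for v in b.values()))
    return numerator / denominator if denominator else 0.0

# Clue tokens vs. a guess statement sharing both content words:
print(cosine_sim(["laundry", "appliance"], ["laundry", "appliance", "brand"]))
# -> about 0.816, above the 0.65 threshold, so the guess's answer is kept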
