|
| 1 | +# clean data for tfidf, doc2vec and countvectorizer |
| 2 | + |
| 3 | +import os |
| 4 | +import re |
| 5 | +import csv |
| 6 | +import spacy |
| 7 | +import string |
| 8 | +import pandas as pd |
| 9 | + |
| 10 | + |
# spaCy setup: one pipeline to source the stop-word list, one trimmed
# pipeline used only for lemmatization in text_preprocess().
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words

# BUG FIX: the 'en' shortcut was removed in spaCy v3 and raises OSError
# there — load the full model name. Also, in v3 the en_core_web_sm
# lemmatizer is rule-based and needs POS tags from the tagger, so
# "tok2vec"/"tagger"/"attribute_ruler" must stay enabled; disable only the
# components text_preprocess never uses.
load_model = spacy.load('en_core_web_sm', disable=["parser", "ner"])

# Matches common emoji / pictograph Unicode ranges so they can be stripped.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
| 23 | + |
def text_preprocess(x):
    """Normalize one document string for tf-idf / doc2vec / CountVectorizer.

    Lowercases, removes spaCy stop words, strips non-ASCII characters,
    URLs, @mentions, hashtags, punctuation, digit-bearing tokens and
    emojis, then lemmatizes with the module-level `load_model` pipeline.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    str
        Space-joined lemmas of the cleaned text.
    """
    x = x.lower()  # lowercase first so stop-word matching works
    # BUG FIX: split() with no argument also splits on tabs/newlines and
    # collapses runs of whitespace; split(' ') left tokens like "the\n"
    # unmatched against the stop-word set.
    x = ' '.join(word for word in x.split() if word not in sw_spacy)
    x = x.encode('ascii', 'ignore').decode()                     # drop non-ASCII
    x = re.sub(r'https*\S+', ' ', x)                             # URLs
    x = re.sub(r'@\S+', ' ', x)                                  # @mentions
    x = re.sub(r'#\S+', ' ', x)                                  # hashtags
    x = x.replace("'", "")                                       # apostrophes (keep "dont" as one token)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)   # punctuation -> space
    x = re.sub(r'\w*\d+\w*', '', x)                              # tokens containing digits
    x = re.sub(r'\s{2,}', ' ', x)                                # collapse multiple spaces
    # NOTE(review): after the ASCII filter above this is effectively a
    # no-op (emoji are non-ASCII); kept for parity with the original.
    x = emoji_pattern.sub(r'', x)
    x = re.sub('[^A-Za-z0-9]+', ' ', x)                          # any remaining special characters
    doc = load_model(x)
    return " ".join(token.lemma_ for token in doc)
| 41 | + |
def clean_data(df, columns_to_clean, preprocess=None):
    """Add cleaned copies of text columns and drop the originals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text columns; mutated in place and returned.
    columns_to_clean : iterable of str
        Column names to clean; each yields a new ``cleaned_<col>`` column.
    preprocess : callable, optional
        Per-cell cleaning function applied to every value. Defaults to
        :func:`text_preprocess`, preserving the original behavior.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the cleaned columns appended and the originals removed.
    """
    if preprocess is None:
        preprocess = text_preprocess
    for col in columns_to_clean:
        # BUG FIX: the original used .progress_apply, which only exists
        # after a tqdm.pandas() registration this module never performs,
        # so it raised AttributeError. Plain .apply is behaviorally
        # identical minus the progress bar.
        df[f"cleaned_{col}"] = df[col].apply(preprocess)

    # BUG FIX: drop exactly the columns that were cleaned instead of a
    # hard-coded ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'] list,
    # so the function works for any column selection. Identical result
    # for the existing call sites, which pass that same list.
    return df.drop(list(columns_to_clean), axis=1)
| 48 | + |
| 49 | + |
| 50 | + |
if __name__ == "__main__":

    BASE_DIR = "/content/input"

    train_path = os.path.join(BASE_DIR, "train.csv")
    # BUG FIX: this previously pointed at train.csv, so the "test" set was
    # just a second copy of the training data.
    test_path = os.path.join(BASE_DIR, "test.csv")
    sample_submission_path = os.path.join(BASE_DIR, "sample_submission.csv")

    # Text columns to clean in both splits.
    text_columns = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND']

    # escapechar/QUOTE_NONE: the raw CSVs contain embedded quote characters.
    train = pd.read_csv(train_path, escapechar="\\", quoting=csv.QUOTE_NONE)
    # Replace missing values with a placeholder string so text_preprocess
    # receives str, never float NaN.
    train_na_free = train.fillna(value="NaN")
    train_cleaned = clean_data(train_na_free, text_columns)

    # save cleaned train file
    train_cleaned.to_csv("cleaned_train.csv", index=False)

    test = pd.read_csv(test_path, escapechar="\\", quoting=csv.QUOTE_NONE)
    test_na_free = test.fillna(value="NaN")
    test_cleaned = clean_data(test_na_free, text_columns)

    # save cleaned test file
    # BUG FIX: this previously wrote train_cleaned to cleaned_test.csv,
    # silently discarding the cleaned test set.
    test_cleaned.to_csv("cleaned_test.csv", index=False)
# (removed web-scrape artifact: "0 commit comments" — GitHub commit-page chrome, not code)