Commit 250c8b1

Add files via upload

1 parent cdeaf71 commit 250c8b1
4 files changed: +14593 -0 lines changed
@@ -0,0 +1,71 @@
# clean data for tfidf, doc2vec and countvectorizer


import os
import re
import csv
import spacy
import string
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply used in clean_data below


en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words
# lemmatization-only pipeline: keep tok2vec/tagger/attribute_ruler, which the
# rule-based lemmatizer in en_core_web_sm needs in order to assign lemmas
load_model = spacy.load('en_core_web_sm', disable=["parser", "ner"])

emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

def text_preprocess(x):
    x = x.lower()  # lowercase
    x = ' '.join([word for word in x.split(' ') if word not in sw_spacy])  # stopwords
    x = x.encode('ascii', 'ignore').decode()  # drop non-ASCII
    x = re.sub(r'https*\S+', ' ', x)  # urls
    x = re.sub(r'@\S+', ' ', x)  # mentions
    x = re.sub(r'#\S+', ' ', x)  # hashtags
    x = x.replace("'", "")  # remove ticks
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)  # punctuation
    x = re.sub(r'\w*\d+\w*', '', x)  # tokens containing digits
    x = re.sub(r'\s{2,}', ' ', x)  # extra spaces
    x = emoji_pattern.sub(r'', x)  # emojis
    x = re.sub('[^A-Za-z0-9]+', ' ', x)  # special characters
    x = load_model(x)
    x = " ".join([token.lemma_ for token in x])  # lemmatize

    return x

def clean_data(df, columns_to_clean):
    for col in columns_to_clean:
        df[f"cleaned_{col}"] = df[col].progress_apply(text_preprocess)

    df = df.drop(columns=columns_to_clean)
    return df


if __name__ == "__main__":

    BASE_DIR = "/content/input"

    train_path = os.path.join(BASE_DIR, "train.csv")
    test_path = os.path.join(BASE_DIR, "test.csv")
    sample_submission_path = os.path.join(BASE_DIR, "sample_submission.csv")

    train = pd.read_csv(train_path, escapechar="\\", quoting=csv.QUOTE_NONE)
    train_na_free = train.fillna(value="NaN")
    train_cleaned = clean_data(train_na_free, ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'])

    # save cleaned train file
    train_cleaned.to_csv("cleaned_train.csv", index=False)

    test = pd.read_csv(test_path, escapechar="\\", quoting=csv.QUOTE_NONE)
    test_na_free = test.fillna(value="NaN")
    test_cleaned = clean_data(test_na_free, ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'])

    # save cleaned test file
    test_cleaned.to_csv("cleaned_test.csv", index=False)
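Per the header comment, these cleaned CSVs feed tf-idf, doc2vec, and count-based vectorizers. A minimal downstream sketch, assuming scikit-learn is installed; the file name and cleaned_TITLE column come from the script above, while the vocabulary cap is an illustrative choice:

# sketch: vectorize one cleaned column with tf-idf (not part of this commit)
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv("cleaned_train.csv", escapechar="\\", quoting=csv.QUOTE_NONE)
texts = df["cleaned_TITLE"].fillna("").tolist()

vectorizer = TfidfVectorizer(max_features=50000)  # vocabulary cap is an assumed setting
X = vectorizer.fit_transform(texts)  # sparse matrix, shape (n_samples, n_features)
print(X.shape)

Swapping TfidfVectorizer for CountVectorizer gives the count-based variant with the same calls.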
@@ -0,0 +1,84 @@
# clean data for bert (sentence embeddings)

# pip install -q transformers
# pip install -q sentence-transformers
# pip install -q pytorch-lightning
import csv
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

def pre_process(corpus1, corpus2, corpus3, corpus4):
    corpus = list()
    for i in tqdm(range(len(corpus1))):
        # Get title
        title = corpus1[i] if len(corpus1[i]) > 0 else ""
        # Get product description, falling back to bullet points
        if len(corpus2[i]) > 0:
            desc = corpus2[i]
        elif len(corpus4[i]) > 0:
            desc = corpus4[i]
        else:
            desc = ""
        # Get product brand
        brand = corpus3[i] if len(corpus3[i]) > 0 else ""

        if len(title) < 1 and len(desc) < 1 and len(brand) < 1:
            sample = "NAN"
        else:
            sample = title + " " + desc + " " + brand
        corpus.append(sample)
    return corpus

def create_corpus(df):
    corpus1 = df["cleaned_TITLE"].values
    corpus2 = df["cleaned_DESCRIPTION"].values
    corpus3 = df["cleaned_BRAND"].values
    corpus4 = df["cleaned_BULLET_POINTS"].values

    corpus = pre_process(corpus1, corpus2, corpus3, corpus4)

    return corpus

if __name__ == "__main__":

    train_file_path = "/content/input/new_cleaned/new_cleaned_train_bert.csv"
    test_file_path = "/content/input/new_cleaned/new_cleaned_test_bert.csv"

    train_df = pd.read_csv(train_file_path, escapechar="\\", quoting=csv.QUOTE_NONE)
    test_df = pd.read_csv(test_file_path, escapechar="\\", quoting=csv.QUOTE_NONE)

    train_df = train_df.replace(np.nan, '', regex=True)
    test_df = test_df.replace(np.nan, '', regex=True)

    train_corpus = create_corpus(train_df)
    del train_df

    test_corpus = create_corpus(test_df)
    del test_df

    # create SentenceTransformer
    model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2', device='cuda:0')

    train_embeds = model.encode(train_corpus, batch_size=1024, device='cuda:0', show_progress_bar=True)
    del train_corpus

    test_embeds = model.encode(test_corpus, batch_size=1024, device='cuda:0', show_progress_bar=True)
    del test_corpus

    # save embeddings
    with open('sm_train.pkl', "wb") as f:
        pickle.dump({'embeddings': train_embeds}, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open('sm_test.pkl', "wb") as f:
        pickle.dump({'embeddings': test_embeds}, f, protocol=pickle.HIGHEST_PROTOCOL)
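The embeddings are pickled as a dict keyed by 'embeddings'; a minimal sketch of reading them back for downstream use (file name from the script above; paraphrase-MiniLM-L6-v2 produces 384-dimensional vectors):

# sketch: reload the saved sentence embeddings (not part of this commit)
import pickle
import numpy as np

with open('sm_train.pkl', 'rb') as f:
    train_embeds = np.asarray(pickle.load(f)['embeddings'])

print(train_embeds.shape)  # expected (n_samples, 384) for this MiniLM model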
