-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b49b0b5
commit a62ec9f
Showing
4 changed files
with
292 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
### Without Dropout layer | ||
<table> | ||
<tr> | ||
<td>Feature Vector Dimension</td> | ||
<td>Train Loss</td> | ||
<td>Train Accuracy</td> | ||
<td>Test Loss</td> | ||
<td>Test Accuracy</td> | ||
<td>Inference Time</td> | ||
</tr> | ||
<tr> | ||
<td>50d</td> | ||
<td>0.3673</td> | ||
<td>0.9394</td> | ||
<td>0.4503</td> | ||
<td>0.8571</td> | ||
<td>0.0686s</td> | ||
</tr> | ||
<tr> | ||
<td>100d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
<tr> | ||
<td>200d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
<tr> | ||
<td>300d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
|
||
</table> | ||
|
||
<br> | ||
|
||
### With Dropout layer | ||
<table> | ||
<tr> | ||
<td>Feature Vector Dimension</td> | ||
<td>Train Loss</td> | ||
<td>Train Accuracy</td> | ||
<td>Test Loss</td> | ||
<td>Test Accuracy</td> | ||
<td>Inference Time</td> | ||
</tr> | ||
<tr> | ||
<td>50d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
<tr> | ||
<td>100d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
<tr> | ||
<td>200d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
<tr> | ||
<td>300d</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...</td> | ||
<td>...s</td> | ||
</tr> | ||
|
||
</table> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
"""Predict the emoji for one sentence and optionally time repeated inference."""
from argparse import ArgumentParser, BooleanOptionalAction
from time import perf_counter

from text_classifier import EmojiTextClassifier
from tqdm import tqdm

args = ArgumentParser()
args.add_argument('--model', type=str,
                  required=True, help='The model path')
args.add_argument('--dimension', type=int,
                  default=50, help='The dimension of feature vectors')
# No static default: the file is derived from --dimension below so the two
# options cannot silently disagree.
args.add_argument('--vectors-file', type=str, default=None,
                  help='The feature vectors file path '
                       '(default: ./glove_6B/glove.6B.<dimension>d.txt)')
args.add_argument('--sentence', type=str,
                  required=True, help='The sentence to test the model')
# BooleanOptionalAction takes no value, so no `type=` is given (passing one is
# deprecated since Python 3.12).
args.add_argument('--infer', default=True,
                  action=BooleanOptionalAction, help='Whether to inferences the model with your sentence or not')
args.add_argument('--n-infer', type=int,
                  default=100, help='Number of inferences on your sentence')

opt = args.parse_args()
vectors_file = opt.vectors_file or f'./glove_6B/glove.6B.{opt.dimension}d.txt'

classifier = EmojiTextClassifier(opt.dimension)
classifier.load_feature_vectors(vectors_file)
classifier.load_model(opt.model)

# This first prediction is reported to the user and runs before the timed loop.
emoji = classifier.predict(opt.sentence)
print(emoji)

# Skip the benchmark when there is nothing to average over (avoids dividing
# by zero for --n-infer 0).
if opt.infer and opt.n_infer > 0:
    # perf_counter is monotonic and meant for measuring short durations.
    start_time = perf_counter()
    for _ in tqdm(range(opt.n_infer)):
        classifier.predict(opt.sentence)

    duration = perf_counter() - start_time
    print(f'\nAverage inference time: {duration / opt.n_infer}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from keras.models import Sequential, load_model | ||
from keras import layers | ||
from keras.activations import softmax | ||
from keras.losses import categorical_crossentropy | ||
from keras.optimizers import Adam | ||
from keras.callbacks import History, ModelCheckpoint | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
class EmojiTextClassifier: | ||
def __init__(self, dimension: int) -> None: | ||
self.dimension = dimension | ||
self.model = None | ||
self.words_vectors: dict[str, np.ndarray] = {} | ||
|
||
@staticmethod | ||
def load_dataset(dataset_path: str) -> list[np.ndarray, np.ndarray]: | ||
df = pd.read_csv(dataset_path) | ||
x = np.array(df['sentence']) | ||
y = np.array(df['label'], dtype=int) | ||
|
||
return x, y | ||
|
||
def load_feature_vectors(self, file_path: str) -> None: | ||
line: str | ||
|
||
file = open(file_path, encoding='utf-8') | ||
for line in file: | ||
line = line.strip().split(' ') | ||
word = line[0] | ||
vector = np.array(line[1:], dtype=np.float64) | ||
self.words_vectors[word] = vector | ||
|
||
def sentence_to_feature_vectors_avg(self, sentence: str) -> np.ndarray[np.floating] | None: | ||
sentence = sentence.lower() | ||
words = sentence.strip().split(' ') | ||
sum_vectors = np.zeros((50, )) | ||
|
||
try: | ||
for word in words: | ||
vector = self.words_vectors[word] | ||
sum_vectors += vector | ||
return sum_vectors / len(words) | ||
except KeyError: | ||
print(f'There is an unknown word in this sentence: "{sentence}"') | ||
|
||
def convert_sentences_to_vectors(self, sentences: np.ndarray) -> np.ndarray: | ||
sentences_avg = [] | ||
for sentence in sentences: | ||
sentences_avg.append( | ||
self.sentence_to_feature_vectors_avg(sentence) | ||
) | ||
|
||
return np.array(sentences_avg) | ||
|
||
def build_model(self, with_dropout: bool = False) -> None: | ||
if with_dropout: | ||
self.model = Sequential([ | ||
layers.Dropout(.5, name='DropoutLayer'), | ||
layers.Dense(5, activation=softmax, | ||
input_shape=(self.dimension, ), name='OutputLayer') | ||
]) | ||
|
||
else: | ||
self.model = Sequential([ | ||
layers.Dense(5, activation=softmax, | ||
input_shape=(self.dimension, ), name='OutputLayer') | ||
]) | ||
|
||
def compile_model(self, optimizer=Adam(), loss=categorical_crossentropy) -> None: | ||
self.model.compile( | ||
optimizer=optimizer, | ||
loss=loss, | ||
metrics=['accuracy'] | ||
) | ||
|
||
def train_model(self, x_train, y_train, epochs: int, model_path_to_save: str = 'best_emojis_classifier.keras') -> History: | ||
check = ModelCheckpoint(model_path_to_save, | ||
monitor='accuracy', save_best_only=True) | ||
|
||
history: History = self.model.fit( | ||
x_train, | ||
y_train, | ||
epochs=epochs, | ||
callbacks=[check] | ||
) | ||
|
||
return history | ||
|
||
@staticmethod | ||
def evaluate(model_path, x_test, y_test) -> None: | ||
model: Sequential = load_model(model_path) | ||
print('\nEvaluating model...') | ||
model.evaluate(x_test, y_test) | ||
|
||
def load_model(self, model_path: str) -> None: | ||
self.model: Sequential = load_model(model_path) | ||
|
||
def predict(self, sentence: str) -> str: | ||
sentence_avg = self.sentence_to_feature_vectors_avg(sentence) | ||
sentence_avg = np.array([sentence_avg]) | ||
|
||
prediction = self.model.predict(sentence_avg) | ||
y_hat = np.argmax(prediction) | ||
|
||
return EmojiTextClassifier.covert_label_to_emoji(y_hat) | ||
|
||
@staticmethod | ||
def covert_label_to_emoji(label: int) -> str: | ||
emojis = ['🧡', '⚾', '😃', '😔', '🍴'] | ||
return emojis[label] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
"""Train the emoji classifier, evaluate the best checkpoint and plot the run."""
from argparse import ArgumentParser, BooleanOptionalAction

from keras.callbacks import History
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from text_classifier import EmojiTextClassifier

# The classifier's output layer has five units, one per emoji.
NUM_CLASSES = 5

args = ArgumentParser()
args.add_argument('--train-dataset', type=str,
                  default='./dataset/train.csv', help='The train dataset path')
args.add_argument('--test-dataset', type=str,
                  default='./dataset/test.csv', help='The test dataset path')
args.add_argument('--dimension', type=int,
                  default=50, help='The dimension of feature vectors')
# No static default: the file is derived from --dimension below so the two
# options cannot silently disagree.
args.add_argument('--vectors-file', type=str, default=None,
                  help='The feature vectors file path '
                       '(default: ./glove_6B/glove.6B.<dimension>d.txt)')
# BooleanOptionalAction takes no value, so no `type=` is given (passing one is
# deprecated since Python 3.12).
args.add_argument('--dropout',
                  default=False, action=BooleanOptionalAction, help='Add dropout layer to network')
args.add_argument('--model-save', type=str,
                  default='best_emojis_classifier.keras', help='The best model path to save')
args.add_argument('--epochs', type=int,
                  default=200, help='The number of epochs to train the model')
args.add_argument('--save-plots',
                  default=True, action=BooleanOptionalAction, help='Save the training information plots')

opt = args.parse_args()
vectors_file = opt.vectors_file or f'./glove_6B/glove.6B.{opt.dimension}d.txt'

classifier = EmojiTextClassifier(opt.dimension)
x_train, y_train = EmojiTextClassifier.load_dataset(opt.train_dataset)
x_test, y_test = EmojiTextClassifier.load_dataset(opt.test_dataset)

classifier.load_feature_vectors(vectors_file)
x_train = classifier.convert_sentences_to_vectors(x_train)
x_test = classifier.convert_sentences_to_vectors(x_test)

# Fix the one-hot width so train and test agree even if one split happens to
# lack the highest label.
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = to_categorical(y_test, num_classes=NUM_CLASSES)

classifier.build_model(opt.dropout)
classifier.compile_model()
history: History = classifier.train_model(
    x_train, y_train, opt.epochs, opt.model_save)

# Evaluate the checkpoint written during training, not the final weights.
EmojiTextClassifier.evaluate(opt.model_save, x_test, y_test)

if opt.save_plots:
    # subplots returns (figure, axes); unpack both directly.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    ax1.plot(history.history['accuracy'])
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Accuracy')

    ax2.plot(history.history['loss'])
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Loss')
    # Report the dropout setting actually used for this run.
    fig.suptitle(f'Emojis Classification With Dropout: {opt.dropout}')
    plt.savefig(f'Emojis_Classification_{opt.epochs}ep_dropout_{opt.dropout}.png')