w2v.py
'''
A program that trains word2vec embeddings in Python 3 with the help of gensim.
Step 1 : Fetch the list of files from the given directory (if none is given, look for a directory named "data")
Step 2 : Look for CSV files and, in those files, read the column named "content"
Step 3 : Parse every article into sentences
Step 4 : Split the sentences into words, taking care of phrases, and lowercase the entire corpus
Step 5 : Train Word2Vec with the required parameters
Step 6 : Save the embeddings (see the save/reload sketch inside the __main__ block at the bottom of this file)
Step 7 : Reduce the number of dimensions
Step 8 : Plot the vectors obtained after dimensionality reduction
Step 9 : Perform the operation you need
'''
import glob
import logging
import multiprocessing
import os
import re
from itertools import islice
from multiprocessing import Pool

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import sklearn.manifold
from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords

phrases = Phrases(min_count=5, threshold=10, delimiter=b'_', progress_per=5000)
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
cachedStopWords = stopwords.words("english")
phraser = None      # built once the phrase vocabulary has been collected
article2vec = None  # assigned once the Word2Vec model has been trained
class W2V:
    NUMBER_OF_PROCESSOR = 8

    # Entry point: configure logging, load the sentence tokenizer,
    # initialise the per-instance state and read the CSV files in the given directory.
    def __init__(self, location):
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.original_location = ""
        self.sentences = []
        self.raw_sentences = []
        self.corpus_raw = u""
        self.pool = None
        self.read_files(location)
    # List the CSV files in the given directory and process each one
    def read_files(self, location):
        self.original_location = os.getcwd()
        os.chdir("./" + location)
        list_of_files = glob.glob("*.csv")
        print(list_of_files)
        for fi in list_of_files:
            self.fetch_data_from_file(fi)
    # Fetch the "content" column of one CSV file, build the corpus,
    # detect phrases and train Word2Vec on the resulting sentences
    def fetch_data_from_file(self, file_name):
        print("Current file being processed is " + file_name)
        df = pd.read_csv(file_name)
        list_of_article = df['content']
        print("Length : " + str(len(list_of_article)))
        lol = self.lol(list_of_article, self.NUMBER_OF_PROCESSOR)
        self.pool = Pool(self.NUMBER_OF_PROCESSOR)
        # Clean each chunk of articles in parallel; every worker returns one big string
        chunk_corpus_list = list(self.pool.map(self.add_content_to_corpus, lol))
        for chunks in chunk_corpus_list:
            self.corpus_raw += chunks
        print("String length of corpus : " + str(len(self.corpus_raw)))
        # Split the chunk strings produced by the worker pool into sentences
        all_sentences = self.pool.map(self.token_maker, chunk_corpus_list)
        for i in all_sentences:
            for j in i:
                self.raw_sentences.append(j)
        print(self.raw_sentences[0])
        print("Number of Sentences are : " + str(len(self.raw_sentences)))
        # Collect the phrase vocabulary. add_vocab has to run in the parent
        # process, because updates made inside pool workers are lost when they exit.
        for tokenized_chunk in self.pool.map(self.phrases_generation, all_sentences):
            phrases.add_vocab(tokenized_chunk)
        global phraser
        phraser = Phraser(phrases)
        # Apply the phraser in the parent process as well: the pool workers were
        # forked before the phraser was built and would only see a stale copy.
        for chunk in all_sentences:
            for sentence_words in self.word_sentence(chunk):
                self.sentences.append(sentence_words)
        # Dimensionality of the resulting word vectors:
        # more dimensions are more expensive to train but more expressive.
        num_features = 300
        # Minimum word count threshold.
        min_word_count = 3
        # Number of worker threads: the more workers, the faster the training.
        num_workers = multiprocessing.cpu_count()
        # Context window length.
        context_size = 7
        # Down-sample setting for frequent words (values between 1e-5 and 1e-3 are typical).
        downsampling = 1e-3
        # Seed for the random number generator, to make the results reproducible
        # (deterministic, good for debugging).
        seed = 1
        global article2vec
        article2vec = Word2Vec(
            sg=1,
            seed=seed,
            workers=num_workers,
            size=num_features,
            min_count=min_word_count,
            window=context_size,
            sample=downsampling
        )
        article2vec.build_vocab(self.sentences)
        article2vec.train(self.sentences,
                          total_examples=article2vec.corpus_count,
                          epochs=article2vec.iter)
        # L2-normalise the vectors once training is finished (saves memory,
        # but the model can no longer be trained further).
        article2vec.init_sims(replace=True)
        print("Words similar to Trump : ")
        print(article2vec.wv.most_similar("trump", topn=15))
        print("Words similar to Obama : ")
        print(article2vec.wv.most_similar("obama", topn=15))
        # all_word_vectors_matrix = article2vec.wv.vectors
        # print(article2vec.wv.vectors[0])
        # list_of_emb = self.chunk(article2vec.wv.vectors, self.NUMBER_OF_PROCESSOR)
        # df_list = self.pool.map(self.dimension_reduction, list_of_emb)
        # points = pd.DataFrame(columns=["word", "x", "y"])
        # for item in df_list:
        #     points.append(item)
        #
        # print(points.head(10))
        # sns.set_context("poster")
        # points.plot.scatter("x", "y", s=10, figsize=(20, 12))
        self.pool.close()
        self.pool.terminate()
        print("Word2Vec vocabulary length:", len(article2vec.wv.vocab))
    # Split one big list into `size` smaller lists and return the list of lists
    @staticmethod
    def lol(list_of_article, size):
        return list(list_of_article[i::size] for i in range(size))

    # Lowercase a chunk of articles, drop stop words, single-letter tokens and
    # quote characters, and concatenate everything into one string
    @staticmethod
    def add_content_to_corpus(list_of_article):
        string_of_article = u""
        for article in list_of_article:
            # lowercase first so that capitalised stop words are removed as well
            curr = ' '.join([word for word in article.lower().split() if word not in cachedStopWords])
            curr = re.sub(r'\b\w\b', '', curr)
            string_of_article += curr.replace("‘", '').replace("’", '').replace("'", '')
        return string_of_article
    # Split one chunk string into sentences with the punkt tokenizer
    @staticmethod
    def token_maker(data):
        return nltk.data.load('tokenizers/punkt/english.pickle').tokenize(data)

    # Tokenise every sentence of a chunk into words and return the result so
    # the parent process can feed it to Phrases.add_vocab
    @staticmethod
    def phrases_generation(data):
        data_new = []
        for i in data:
            data_new.append(sentences_to_words(i))
        return data_new
    # Turn the non-empty sentences of a chunk into phrase-merged word lists
    @staticmethod
    def word_sentence(data):
        ret = []
        for items in data:
            if len(items) > 0:
                ret.append(phraser[sentences_to_words(items)])
        return ret
    # Split an iterable into consecutive chunks of at most `size` items
    @staticmethod
    def chunk(it, size):
        it = iter(it)
        return list(iter(lambda: list(islice(it, size)), []))
    # Project the word vectors down to 2D with t-SNE and return a DataFrame
    # of (word, x, y) rows
    @staticmethod
    def dimension_reduction(all_word_vectors_matrix):
        # TODO: Use PCA or LDA first to narrow down the dimensions and then run t-SNE?
        all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
        points = pd.DataFrame(
            [
                (word, coords[0], coords[1])
                for word, coords in [(word, all_word_vectors_matrix_2d[article2vec.wv.vocab[word].index])
                                     for word in article2vec.wv.vocab]
            ],
            columns=["word", "x", "y"]
        )
        return points
# Strip everything but letters from a sentence and split it into word tokens
def sentences_to_words(sentence):
    clean = re.sub("[^a-zA-Z]", " ", sentence)
    words = clean.split()
    return words
if __name__ == "__main__":
    # Guarding the entry point keeps multiprocessing.Pool usable on platforms
    # that spawn rather than fork worker processes.
    W2V("all-the-news")
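    # --- Saving and reloading the embeddings (Step 6 of the docstring) ---
    # A minimal sketch, not part of the original pipeline: the steps above
    # train the model but never persist it. The calls below use the standard
    # gensim save/load API; "news_w2v.model" is an arbitrary file name chosen
    # here for illustration.
    article2vec.save("news_w2v.model")
    reloaded = Word2Vec.load("news_w2v.model")
    # Sanity check: the reloaded model should answer the same similarity query.
    print(reloaded.wv.most_similar("trump", topn=5))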