# -*- coding: utf-8 -*-
"""workshop.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VMRXr0Iel0gsbM6Aivw9HjvGZNwKkh9V
"""
# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# load the workshops dataset (Google Drive CSV), indexed by event
workshops = pd.read_csv('https://drive.google.com/uc?id=10MngpIZoAGgwAk_sxoORj7WPYs74nz5Y').set_index('event')
workshops.head()
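# Expected columns, inferred from the code below (the CSV itself is not
# shown here): 'event' (index), a 'workshop' title string, and
# comma-separated 'tags'.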
done = workshops.copy()  # keep an unexploded copy for the final lookup below
workshops['id'] = range(len(workshops))
# explode comma-separated tags so each (workshop, tag) pair gets its own row
ws = workshops.assign(tags=workshops.tags.str.split(",")).explode('tags')
print(ws.tags)
#ws.head(10)
# download required packages
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('gutenberg')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
stops = stopwords.words('english')  # without an argument this loads stopwords for every language
# lower-case each workshop title and drop stopword tokens within it
ws['workshop'] = ws.workshop.apply(
    lambda title: ' '.join(t for t in title.lower().split() if t not in stops))
ws.head()
le = preprocessing.LabelEncoder()
le.fit(ws.tags)
list(le.classes_)
ws['tags'] = le.transform(ws.tags)
ws.head()
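# Sanity check (illustrative): LabelEncoder is invertible, so
# le.inverse_transform([ws.tags.iloc[0]]) recovers the original tag string;
# label() below relies on this to map predictions back to readable tags.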
# features are the preprocessed titles; labels are the encoded tags
X = ws.workshop.values
y = ws.tags.values
# split into train / cross-validation / test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
X_train.shape, X_test.shape, X_cv.shape
# fit TF-IDF on the training split only, then apply the same vocabulary to cv/test
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_cv = tfidf.transform(X_cv)
X_test = tfidf.transform(X_test)
X_train.shape, X_cv.shape, X_test.shape
from sklearn.linear_model import SGDClassifier
# accuracy_score is already imported above; the old
# sklearn.metrics.classification module was removed in scikit-learn 0.24
score_train = []
score_cv = []
best_alpha = 1e-7
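# The empty score lists and hard-coded best_alpha above suggest an alpha
# sweep that was elided from the notebook. A minimal sketch of such a sweep
# (the alpha grid is an assumption, not from the original):
for a in [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]:
    clf = SGDClassifier(alpha=a, loss="log_loss", class_weight="balanced", n_jobs=-1)
    clf.fit(X_train, y_train)
    score_train.append(accuracy_score(y_train, clf.predict(X_train)))
    score_cv.append(accuracy_score(y_cv, clf.predict(X_cv)))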
from sklearn.metrics import confusion_matrix
import re
#data = tfidf.fit_transform(ws.workshop)
#medium article referenced for preprocessing (https://medium.com/analytics-vidhya/search-engine-using-machine-learning-and-nlp-c1ec1e28be7a)
# final model: bag-of-words features over the full corpus
vectorizer = CountVectorizer()
data = vectorizer.fit_transform(ws['workshop'])
clf_final = SGDClassifier(alpha=1e-7, loss="log_loss", class_weight="balanced", n_jobs=-1)  # loss="log" on scikit-learn < 1.1
clf_final.fit(data, y)
def process_query(query):
    # strip digit-bearing tokens and punctuation, lower-case, drop stopwords
    preprocessed_reviews = []
    sentence = re.sub(r"\S*\d\S*", "", query).strip()
    sentence = re.sub(r'[^A-Za-z]+', ' ', sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords.words('english'))
    preprocessed_reviews.append(sentence.strip())
    return preprocessed_reviews
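# Example (illustrative input, not from the original notebook):
# process_query("Intro to Python 3 workshops!") -> ['intro python workshops']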
#def tfidf_search(query):
#    query = process_query(query)
#    query_trans = tfidf.transform(query)
#    pairwise_dist = pairwise_distances(ws, query_trans)
#    indices = np.argsort(pairwise_dist.flatten())[0:10]
#    df_indices = list(processed.index[indices])
#    return df_indices
def label(query):
    # predict a tag id for the query and map it back to the tag string
    query = process_query(query)
    query = vectorizer.transform(query)  # must match the features clf_final was trained on (tfidf here would raise a dimension mismatch)
    ans = clf_final.predict(query)
    return le.inverse_transform([ans[0]])
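# Example (illustrative): label("some query") returns a one-element
# NumPy array holding the predicted tag string.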
def outputindex(query):
    # look up all workshops whose tags contain the predicted tag word
    output = label(query)
    output = np.array2string(output)
    fin = done.assign(tags=done.tags.str.split(",")).explode('tags')
    fin['tags'] = fin['tags'].apply(str)
    word = " ".join(re.findall("[a-zA-Z]+", output))
    word = word.strip()
    word = word.split()[0]
    tagrem = fin[fin['tags'].str.contains(word)]  # how to check if array has value?? seems to subset everything
    tagrem = tagrem['workshop']
    #tagrem.drop_duplicates()
    tagrem = tagrem.to_list()
    return tagrem
## allow the user to enter a query here; the only thing that needs to be
## changed is the query argument (might be unique to torchlit)
def enter_queries(query):
    print("You want to learn about:", query)
    finalmat = outputindex(query)
    output = label(query)
    output = np.array2string(output)
    word2 = " ".join(re.findall("[a-zA-Z]+", output))
    word2 = word2.strip()
    print("Our ML algorithm found this is commonly tagged with:", word2)
    print("Check this out!:")
    #print("", finalmat)
    return finalmat
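# Example usage (the query string is illustrative, not from the original notebook):
if __name__ == "__main__":
    recommendations = enter_queries("machine learning")
    print(recommendations)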