# -*- coding: utf-8 -*-
"""workshop.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VMRXr0Iel0gsbM6Aivw9HjvGZNwKkh9V
"""
# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# load the workshops dataset (Google Drive CSV), indexed by event
workshops = pd.read_csv('https://drive.google.com/uc?id=10MngpIZoAGgwAk_sxoORj7WPYs74nz5Y').set_index('event')
workshops.head()
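# Expected columns, inferred from the code below (the CSV itself is not
# shown here): 'event' (index), a 'workshop' title string, and
# comma-separated 'tags'.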
done = workshops.copy()  # keep an unexploded copy for the final lookup below
workshops['id'] = range(len(workshops))
# explode comma-separated tags so each (workshop, tag) pair gets its own row
ws = workshops.assign(tags=workshops.tags.str.split(",")).explode('tags')
print(ws.tags)
#ws.head(10)
# download required packages
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('gutenberg')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
stops = stopwords.words('english')  # without an argument this loads stopwords for every language
# lower-case each workshop title and drop stopword tokens within it
ws['workshop'] = ws.workshop.apply(
    lambda title: ' '.join(t for t in title.lower().split() if t not in stops))
ws.head()
le = preprocessing.LabelEncoder()
le.fit(ws.tags)
list(le.classes_)
ws['tags'] = le.transform(ws.tags)
ws.head()
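# Sanity check (illustrative): LabelEncoder is invertible, so
# le.inverse_transform([ws.tags.iloc[0]]) recovers the original tag string;
# label() below relies on this to map predictions back to readable tags.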
# features are the preprocessed titles; labels are the encoded tags
X = ws.workshop.values
y = ws.tags.values
# split into train / cross-validation / test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
X_train.shape, X_test.shape, X_cv.shape
# fit TF-IDF on the training split only, then apply the same vocabulary to cv/test
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_cv = tfidf.transform(X_cv)
X_test = tfidf.transform(X_test)
X_train.shape, X_cv.shape, X_test.shape
from sklearn.linear_model import SGDClassifier
# accuracy_score is already imported above; the old
# sklearn.metrics.classification module was removed in scikit-learn 0.24
score_train = []
score_cv = []
best_alpha = 1e-7
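# The empty score lists and hard-coded best_alpha above suggest an alpha
# sweep that was elided from the notebook. A minimal sketch of such a sweep
# (the alpha grid is an assumption, not from the original):
for a in [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]:
    clf = SGDClassifier(alpha=a, loss="log_loss", class_weight="balanced", n_jobs=-1)
    clf.fit(X_train, y_train)
    score_train.append(accuracy_score(y_train, clf.predict(X_train)))
    score_cv.append(accuracy_score(y_cv, clf.predict(X_cv)))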
from sklearn.metrics import confusion_matrix
import re
#data = tfidf.fit_transform(ws.workshop)
#medium article referenced for preprocessing (https://medium.com/analytics-vidhya/search-engine-using-machine-learning-and-nlp-c1ec1e28be7a)
# final model: bag-of-words features over the full corpus
vectorizer = CountVectorizer()
data = vectorizer.fit_transform(ws['workshop'])
clf_final = SGDClassifier(alpha=1e-7, loss="log_loss", class_weight="balanced", n_jobs=-1)  # loss="log" on scikit-learn < 1.1
clf_final.fit(data, y)
def process_query(query):
    # strip digit-bearing tokens and punctuation, lower-case, drop stopwords
    preprocessed_reviews = []
    sentence = re.sub(r"\S*\d\S*", "", query).strip()
    sentence = re.sub(r'[^A-Za-z]+', ' ', sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords.words('english'))
    preprocessed_reviews.append(sentence.strip())
    return preprocessed_reviews
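# Example (illustrative input, not from the original notebook):
# process_query("Intro to Python 3 workshops!") -> ['intro python workshops']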
#def tfidf_search(query):
#    query = process_query(query)
#    query_trans = tfidf.transform(query)
#    pairwise_dist = pairwise_distances(ws, query_trans)
#    indices = np.argsort(pairwise_dist.flatten())[0:10]
#    df_indices = list(processed.index[indices])
#    return df_indices
def label(query):
    # predict a tag id for the query and map it back to the tag string
    query = process_query(query)
    query = vectorizer.transform(query)  # must match the features clf_final was trained on (tfidf here would raise a dimension mismatch)
    ans = clf_final.predict(query)
    return le.inverse_transform([ans[0]])
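# Example (illustrative): label("some query") returns a one-element
# NumPy array holding the predicted tag string.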
def outputindex(query):
    # look up all workshops whose tags contain the predicted tag word
    output = label(query)
    output = np.array2string(output)
    fin = done.assign(tags=done.tags.str.split(",")).explode('tags')
    fin['tags'] = fin['tags'].apply(str)
    word = " ".join(re.findall("[a-zA-Z]+", output))
    word = word.strip()
    word = word.split()[0]
    tagrem = fin[fin['tags'].str.contains(word)]  # how to check if array has value?? seems to subset everything
    tagrem = tagrem['workshop']
    #tagrem.drop_duplicates()
    tagrem = tagrem.to_list()
    return tagrem
## allow the user to enter a query here; the only thing that needs to be
## changed is the query argument (might be unique to torchlit)
def enter_queries(query):
    print("You want to learn about:", query)
    finalmat = outputindex(query)
    output = label(query)
    output = np.array2string(output)
    word2 = " ".join(re.findall("[a-zA-Z]+", output))
    word2 = word2.strip()
    print("Our ML algorithm found this is commonly tagged with:", word2)
    print("Check this out!:")
    #print("", finalmat)
    return finalmat
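# Example usage (the query string is illustrative, not from the original notebook):
if __name__ == "__main__":
    recommendations = enter_queries("machine learning")
    print(recommendations)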