
Commit c16e771 (0 parents)
Commit message: Adding files

12 files changed: +110087 additions, 0 deletions

analysis.py (+235 lines)
@@ -0,0 +1,235 @@
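"""Sentiment analysis of tokenized company filings against the
Loughran-McDonald sentiment word lists.

Builds per-category bag-of-words and tf-idf representations of each year's
document, collapses them into a single yearly sentiment score, compares
successive years with Jaccard and cosine similarity, and plots the score
against the ticker's price history downloaded via yfinance.
"""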
import json
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf

from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity as cs

def main():
    sentiment_scores_test('fb', 'tokenized_words.json')

def sentiment_scores_test(ticker, input_filename):
    """Score one ticker's tokenized filings and plot the result."""
    sentiments = pd.read_csv('supporting_data/sentiment_dataframe.csv')

    with open(input_filename, 'r') as f:
        tokens = json.load(f)

    documents = reformat_documents(ticker, tokens)
    sbow = sentiment_bag_of_words(documents, sentiments)
    scores = sentiment_scores(sbow)
    print(scores)
    plot_sentiment_scores(ticker, scores)

def sentiment_scores(sbow):
    """Collapse per-category word counts into a single score per year."""
    positive_categories = ['positive', 'superfluous', 'interesting']
    negative_categories = ['negative', 'litigious', 'uncertainty', 'constraining']

    scores = {}

    # Each category contributes the mean count over its vocabulary; positive
    # categories add to the year's score, negative ones subtract. Using
    # scores.get() avoids a KeyError if a year is missing from 'positive'.
    for category in positive_categories:
        for year, counts in sbow[category].items():
            scores[year] = scores.get(year, 0) + counts.mean()

    for category in negative_categories:
        for year, counts in sbow[category].items():
            scores[year] = scores.get(year, 0) - counts.mean()

    # Squash each score into (-1, 1) so years are directly comparable.
    for year in scores:
        scores[year] = math.tanh(scores[year])

    return dict(sorted(scores.items()))

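# A minimal worked example of the scoring above (illustrative numbers only):
# if a year's mean counts are positive 0.6, superfluous 0.1, interesting 0.1
# and negative 0.2, litigious 0.1, uncertainty 0.05, constraining 0.05, the
# raw score is (0.6 + 0.1 + 0.1) - (0.2 + 0.1 + 0.05 + 0.05) = 0.4, and
# tanh(0.4) ≈ 0.38 is what ends up in the returned dict.
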
def plot_sentiment_scores(ticker, scores):
    """Plot the yearly stock-price high above the yearly sentiment score."""
    # Drop any 1999 filing first: years are two-digit strings, so '99' would
    # otherwise be treated as the maximum year when building the date range.
    scores.pop('99', None)

    m = min(scores.keys())
    M = max(scores.keys())

    df = yf.download(ticker, start=f'20{m}-01-01', end=f'20{M}-03-01', interval='3mo')
    df.to_csv(f'price_time_series/{ticker}.csv')
    # Reading the CSV back turns the index into a 'Date' string column.
    df = pd.read_csv(f'price_time_series/{ticker}.csv')

    # Highest quarterly 'High' observed in each scored year.
    price = {year: 0 for year in scores}

    for _, row in df.iterrows():
        year = row['Date'][2:4]
        if year in price and row['High'] > price[year]:
            price[year] = row['High']

    figure, (ax1, ax2) = plt.subplots(2)  # sharex=True would align the year axes

    ax1.plot(list(price.keys()), list(price.values()))
    ax1.set(title=f'{ticker}', xlabel='years', ylabel='stock price')
    ax2.plot(list(scores.keys()), list(scores.values()))
    ax2.set(xlabel='years', ylabel='sentiment score')

    # plt.savefig('sample', dpi=300)
    plt.show()

def similarity_test(input_filename, ticker):
    """Compare successive years of one ticker's filings and save the results."""
    sentiments = pd.read_csv('supporting_data/sentiment_dataframe.csv')

    with open(input_filename, 'r') as f:
        tokens = json.load(f)

    documents = reformat_documents(ticker, tokens)

    sbow = sentiment_bag_of_words(documents, sentiments)
    similarities = jaccard_similarity(sbow)

    with open('supporting_data/jaccard_similarities.json', 'w') as f:
        json.dump(similarities, f)

    stfidf = sentiment_tfidf(documents, sentiments)
    similarities = cosine_similarity(stfidf)

    with open('supporting_data/cosine_similarities.json', 'w') as f:
        json.dump(similarities, f)

def create_sentiment_dataframe():
    """Build a word/sentiment-flag table from the Loughran-McDonald dictionary."""
    df = pd.read_csv('supporting_data/LoughranMcDonald_MasterDictionary_2018.csv')

    # Set column names and words to lower case
    df.columns = df.columns.str.lower()
    df['word'] = [str(word).lower() for word in df['word']]

    # Select the word column and the sentiment-category columns
    sentiment_words = list(df.columns[7:14])
    df = df[['word'] + sentiment_words]

    # Keep only words flagged in at least one sentiment category
    df[sentiment_words] = df[sentiment_words].astype(bool)
    df = df[df[sentiment_words].any(axis=1)]

    # Lemmatize words and remove duplicates
    wnl = WordNetLemmatizer()
    df['word'] = [wnl.lemmatize(str(word)) for word in df['word']]
    df = df.drop_duplicates('word')

    return df

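# Note: the column slice above assumes the seven sentiment categories used
# elsewhere in this file (positive, superfluous, interesting, negative,
# litigious, uncertainty, constraining) occupy columns 7-13 of this edition
# of the master dictionary; a different edition may need a different slice.
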
def reformat_documents(ticker, tokens):
    """Join each year's token lists into a single space-separated string."""
    documents = {}

    for year in tokens[ticker]:
        documents[year] = ' '.join([item for sublist in tokens[ticker][year] for item in sublist])

    return documents

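# Example of the expected shapes, assuming the tokenized JSON looks like
# {'fb': {'13': [['revenue', 'grew'], ['risk', 'factors']], ...}}:
# reformat_documents('fb', tokens) -> {'13': 'revenue grew risk factors', ...}
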
# Analysis
def sentiment_bag_of_words(documents, sentiments):
    """Count occurrences of each category's vocabulary in each year's document."""
    sentiment_words = list(sentiments.columns[2:9])
    sbow = {}

    for word in sentiment_words:
        sbow[word] = {}
        # Restrict the vocabulary to the words flagged for this category.
        # int64 avoids the overflow an int8 count vector would hit past 127.
        vectorizer = CountVectorizer(vocabulary=sentiments[sentiments[word]]['word'],
                                     analyzer='word',
                                     lowercase=False,
                                     dtype=np.int64)

        model = vectorizer.fit(documents.values())

        for year in documents.keys():
            sbow[word][year] = model.transform([documents[year]]).toarray()[0]

    return sbow

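# The result is nested as sbow[category][year] -> integer count vector over
# that category's vocabulary, e.g. sbow['negative']['17'][i] is how often the
# i-th negative-dictionary word appears in the 2017 document.
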
def sentiment_tfidf(documents, sentiments):
    """Compute tf-idf vectors over each category's vocabulary for each year."""
    sentiment_words = list(sentiments.columns[2:9])

    stfidf = {}

    for word in sentiment_words:
        stfidf[word] = {}
        # tf-idf values are fractional, so the dtype must be a float type.
        vectorizer = TfidfVectorizer(vocabulary=sentiments[sentiments[word]]['word'],
                                     analyzer='word',
                                     lowercase=False,
                                     dtype=np.float64)

        model = vectorizer.fit(documents.values())

        for year in documents.keys():
            stfidf[word][year] = model.transform([documents[year]]).toarray()[0]

    return stfidf

def jaccard_similarity(sbow):
    """Jaccard similarity between consecutive years of each category's words."""
    similarities = {}

    for word in sbow:
        similarities[word] = {}

        years = sorted(sbow[word].keys())
        for i in range(len(years) - 1):
            # Binarise the count vectors: only word presence matters here.
            x = sbow[word][years[i]].astype(bool)
            y = sbow[word][years[i + 1]].astype(bool)
            similarities[word][years[i]] = jaccard_score(x, y)

    return similarities

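# For intuition: jaccard_score on boolean vectors is |intersection| / |union|,
# so x = [1, 0, 1] vs y = [1, 1, 0] share one "on" position out of the three
# that are on in either vector, giving 1/3 ≈ 0.33.
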
def cosine_similarity(stfidf):
    """Cosine similarity between consecutive years of each category's tf-idf."""
    similarities = {}

    for word in stfidf:
        similarities[word] = {}

        years = sorted(stfidf[word].keys())
        for i in range(len(years) - 1):
            # sklearn's cosine_similarity expects 2-D inputs.
            x = stfidf[word][years[i]].reshape(1, -1)
            y = stfidf[word][years[i + 1]].reshape(1, -1)
            similarities[word][years[i]] = cs(x, y)[0, 0]

    return similarities

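# For intuition: cosine similarity is the normalised dot product, so
# cs([[1, 0]], [[1, 1]])[0, 0] = 1 / sqrt(2) ≈ 0.71, and identical tf-idf
# vectors score exactly 1.0.
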
if __name__ == '__main__':
    main()
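# Usage sketch (assumes a tokenized_words.json produced by the project's
# tokenization step, the supporting_data/ CSVs, an existing
# price_time_series/ directory, and network access for yfinance):
#     python analysis.py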
