-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.py
executable file
·110 lines (93 loc) · 3.89 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
"""Train and predict."""
from os.path import exists
from pathlib import Path
import pandas as pd
from joblib import dump, load
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
STORAGE_DIR = f'{Path.home()}/code/predictor/storage'
PREDICTIONS = f'{STORAGE_DIR}/predictions.pkl.gz'
LOAD_MODEL = False
PRINT_ONLY = False
def load_data():
"""Load hackernews and stocks data."""
hackernews_df = pd.read_pickle(f'{STORAGE_DIR}/hackernews.pkl.gz')
stocks_df = pd.read_pickle(f'{STORAGE_DIR}/stocks.pkl.gz')
return (hackernews_df, stocks_df)
def vectorize(dataframe):
"""Hashing vectorize dataframe by day."""
vectorizer = HashingVectorizer(
lowercase=False, tokenizer=lambda x: [x], alternate_sign=False)
day_dfs = []
for day in sorted(dataframe.index.unique()):
# Duplicate all rows based on their count.
new_df = dataframe.loc[day:day].reset_index()
new_df = new_df.loc[new_df.index.repeat(new_df['count'].astype(int))].drop(
columns='count').set_index('date')
transformed = vectorizer.transform(new_df['gram'])
day_df = pd.DataFrame(transformed.sum(axis=0), index=[day])
day_dfs.append(day_df)
final_df = pd.concat(day_dfs)
return scale(final_df, axis=1)
def prepare_dfs(up_to=None):
"""Sanitize dfs for training."""
hackernews_df, stocks_df = load_data()
# Increment days in hackernews, which will be matched with stocks for that day.
# i.e. news from day before predicts stocks of current day
hackernews_df.index = hackernews_df.index + pd.Timedelta(1, unit='D')
# Restrict training data.
if up_to:
print(f'Making model up to {up_to}')
hackernews_df = hackernews_df.loc[:up_to]
# Restrict training data to dates with known data in stocks.
hackernews_df = hackernews_df.loc[hackernews_df.index.isin(
stocks_df.index.unique())]
stocks_df = stocks_df.loc[stocks_df.index.isin(
hackernews_df.index.unique())]
stocks_df = stocks_df['regular_market_change_percent']
return vectorize(hackernews_df), stocks_df
def make_model(up_to=None, save_model=False, hackernews_df=None, stocks_df=None):
"""Make prediction model."""
if hackernews_df is None or stocks_df is None:
hackernews_df, stocks_df = prepare_dfs(up_to)
clf = LinearRegression(n_jobs=-1)
clf.fit(hackernews_df, stocks_df)
if save_model and not PRINT_ONLY:
dump(clf, f'{STORAGE_DIR}/model.joblib.gz')
return clf
def main():
"""Main."""
if LOAD_MODEL:
clf = load(f'{STORAGE_DIR}/model.joblib.gz')
else:
clf = make_model(save_model=True)
hackernews_df, stocks_df = load_data()
stocks_df = stocks_df['regular_market_change_percent']
yesterday = pd.Timestamp.now() - pd.Timedelta(1, unit='D')
yesterday_str = yesterday.strftime('%Y-%m-%d')
today_str = pd.Timestamp.now().strftime('%Y-%m-%d')
predictions = clf.predict(
vectorize(hackernews_df.loc[yesterday_str:yesterday_str]))
new_df = pd.DataFrame(
columns=stocks_df.columns, data=predictions, index=[pd.Timestamp(today_str)])
if exists(PREDICTIONS):
# Remove any old prediction dates which we don't have prices for.
# These might be weekends, etc.
old_df = pd.read_pickle(PREDICTIONS)
old_df = old_df.loc[old_df.index.isin(stocks_df.index)]
predictions_df = pd.concat([old_df, new_df])
else:
predictions_df = new_df
predictions_df.index.name = 'date'
predictions_df.columns.name = 'symbol'
# Remove any duplicate dates, keeping last.
predictions_df = predictions_df[~predictions_df.index.duplicated(
keep='last')]
if PRINT_ONLY:
print(predictions_df[::-1])
else:
predictions_df.to_pickle(PREDICTIONS)
if __name__ == '__main__':
main()