Skip to content

Commit eb37cfa

Browse files
authored
Merge pull request #13 from stephenhky/yfinance
Further Update on Faster Caching
2 parents 98e6b8f + 403774e commit eb37cfa

File tree

2 files changed

+69
-14
lines changed

2 files changed

+69
-14
lines changed

finsim/data/preader.py

+68-13
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from datetime import datetime, timedelta
33
import os
44
import logging
5+
from time import sleep
6+
import glob
57

68
import pandas as pd
79
import yfinance as yf
@@ -30,18 +32,20 @@ def extract_online_yahoofinance_data(symbol, startdate, enddate):
3032

3133
oricols = df.columns
3234
df['TimeStamp'] = df.index
35+
# df['TimeStamp'] = df['TimeStamp'].dt.strftime('%Y-%m-%d')
3336
# df['Date'] = df['TimeStamp'].apply(lambda ts: ts.date())
3437
df = df[['TimeStamp'] + list(oricols)]
3538

3639
return df
3740

3841

39-
def extract_batch_online_yahoofinance_data(symbols, startdate, enddate):
42+
def extract_batch_online_yahoofinance_data(symbols, startdate, enddate, threads=True):
4043
combined_df = yf.download(
4144
' '.join(symbols),
4245
start=datetime.strptime(startdate, '%Y-%m-%d'),
4346
end=datetime.strptime(enddate, '%Y-%m-%d'),
44-
group_by='ticker'
47+
group_by='ticker',
48+
threads=threads
4549
)
4650

4751
dataframes = {}
@@ -50,6 +54,7 @@ def extract_batch_online_yahoofinance_data(symbols, startdate, enddate):
5054
df = combined_df[symbol].copy()
5155
oricols = df.columns
5256
df['TimeStamp'] = df.index
57+
# df['TimeStamp'] = df['TimeStamp'].dt.strftime('%Y-%m-%d')
5358
df = df[['TimeStamp'] + list(oricols)]
5459
dataframes[symbol] = df
5560
except:
@@ -96,6 +101,10 @@ def get_yahoofinance_data(symbol, startdate, enddate, cacheddir=None):
96101
preexist = False
97102
for row in table.where('symbol=="{}"'.format(symbol)):
98103
preexist = True
104+
# print("{} <= {}: {}".format(row['query_startdate'].decode('utf-8'), startdate,
105+
# row['query_startdate'].decode('utf-8') <= startdate))
106+
# print("{} <= {}: {}".format(row['query_enddate'].decode('utf-8'), enddate,
107+
# row['query_enddate'].decode('utf-8') >= enddate))
99108
if row['query_startdate'].decode('utf-8') <= startdate and row['query_enddate'].decode('utf-8') >= enddate:
100109
df = pd.read_hdf(os.path.join(cacheddir, '{}.h5'.format(symbol)), 'yahoodata')
101110
if len(df) > 0:
@@ -141,22 +150,68 @@ def get_yahoofinance_data(symbol, startdate, enddate, cacheddir=None):
141150
raise TypeError('Type of cacheddir has to be str, but got {} instead!'.format(type(cacheddir)))
142151

143152

144-
def generating_cached_yahoofinance_data(symbols, startdate, enddate, cacheddir, slicebatch=50):
153+
def finding_missing_symbols_in_cache(symbols, startdate, enddate, cacheddir):
    """Return the symbols that still have to be downloaded into the cache.

    A symbol is treated as already cached only when both conditions hold:

    * the metatable stored in ``cacheddir`` has a row for it whose
      ``query_startdate`` is ``<= startdate`` and whose ``query_enddate``
      is ``>= enddate``; and
    * a ``<symbol>.h5`` file is present in ``cacheddir``.

    Args:
        symbols: collection of ticker symbols that are requested.
        startdate: start of the requested range, compared directly against
            the ``query_startdate`` column.
            NOTE(review): presumably a ``'%Y-%m-%d'`` string, so that the
            comparison is chronological -- confirm against callers.
        enddate: end of the requested range, compared directly against the
            ``query_enddate`` column (same assumption as ``startdate``).
        cacheddir: directory that holds the metatable file and the
            per-symbol ``.h5`` files.

    Returns:
        ``symbols`` itself (unchanged, unsorted) when no metatable file
        exists in ``cacheddir``; otherwise a sorted list of the requested
        symbols that are not validly cached.
    """
    metatable_path = os.path.join(cacheddir, METATABLE_FILENAME)
    if not os.path.exists(metatable_path):
        # Nothing has been cached yet, so every symbol is missing.
        return symbols

    # isEnabledFor() is the correct test for "will debug records be emitted";
    # comparing the numeric level with ``>=`` is true for WARNING/ERROR/etc.
    # and would run the debug loops exactly when their output is discarded.
    debug_enabled = logging.root.isEnabledFor(logging.DEBUG)

    # Symbols whose recorded query range covers the requested range.
    metatable = pd.read_hdf(metatable_path, 'metatable')
    covers_range = (
        (metatable['query_startdate'] <= startdate)
        & (metatable['query_enddate'] >= enddate)
    )
    existing_within_range_symbols = list(metatable['symbol'][covers_range])
    if debug_enabled:
        logging.debug('exisiting within range symbols')
        for symbol in existing_within_range_symbols:
            logging.debug('\t{}'.format(symbol))

    # Symbols that actually have a data file in the cache directory; the
    # metatable file itself matches '*.h5' and is excluded.
    existing_symbols = [
        os.path.splitext(filename)[0]
        for filename in (
            os.path.basename(filepath)
            for filepath in glob.glob(os.path.join(cacheddir, '*.h5'))
        )
        if filename != METATABLE_FILENAME
    ]
    if debug_enabled:
        logging.debug('exisiting symbols')
        for symbol in existing_symbols:
            logging.debug('\t{}'.format(symbol))

    # A metatable row without a data file (or vice versa) is not a valid
    # cache entry, hence the intersection.
    existing_valid_symbols = set(existing_within_range_symbols) & set(existing_symbols)

    return sorted(set(symbols) - existing_valid_symbols)
183+
184+
185+
def generating_cached_yahoofinance_data(symbols, startdate, enddate, cacheddir, slicebatch=50, waittime=1, threads=True):
186+
tocache_symbols = finding_missing_symbols_in_cache(symbols, startdate, enddate, cacheddir)
187+
188+
logging.info('Total number of symbols: {}'.format(len(symbols)))
189+
logging.info('Total number of symbols needed to cache: {}'.format(len(tocache_symbols)))
145190
if not os.path.exists(cacheddir) or (os.path.exists(cacheddir) and not os.path.isdir(cacheddir)):
146191
logging.info('Creating directory: {}'.format(cacheddir))
147192
os.makedirs(cacheddir)
148-
# start as a new file
149-
logging.info('Creating file: {}'.format(os.path.join(cacheddir, METATABLE_FILENAME)))
150-
metatable_h5file = tb.open_file(os.path.join(cacheddir, METATABLE_FILENAME), 'w')
151-
table = metatable_h5file.create_table('/', 'metatable', METATABLE_ROWDES, title='metatable')
193+
if not os.path.exists(os.path.join(cacheddir, METATABLE_FILENAME)):
194+
logging.info('Creating file: {}'.format(os.path.join(cacheddir, METATABLE_FILENAME)))
195+
metatable_h5file = tb.open_file(os.path.join(cacheddir, METATABLE_FILENAME), 'w')
196+
table = metatable_h5file.create_table('/', 'metatable', METATABLE_ROWDES, title='metatable')
197+
else:
198+
metatable_h5file = tb.open_file(os.path.join(cacheddir, METATABLE_FILENAME), 'r+')
199+
table = metatable_h5file.root.metatable
152200

153-
nbsymbols = len(symbols)
201+
nbsymbols = len(tocache_symbols)
154202
for startidx in tqdm(range(0, nbsymbols, slicebatch)):
155-
dataframes = extract_batch_online_yahoofinance_data(
156-
symbols[startidx:min(startidx+slicebatch, nbsymbols)],
157-
startdate,
158-
enddate
159-
)
203+
success = False
204+
while not success:
205+
try:
206+
dataframes = extract_batch_online_yahoofinance_data(
207+
tocache_symbols[startidx:min(startidx + slicebatch, nbsymbols)],
208+
startdate,
209+
enddate,
210+
threads=threads
211+
)
212+
success = True
213+
except:
214+
sleep(waittime)
160215

161216
for symbol in dataframes:
162217
df = dataframes[symbol]

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def package_description():
1818

1919
setup(
2020
name='finsim',
21-
version="0.3.2",
21+
version="0.3.3",
2222
description="Financial simulation and inference",
2323
long_description=package_description(),
2424
long_description_content_type='text/markdown',

0 commit comments

Comments
 (0)