@@ -2,6 +2,8 @@
 from datetime import datetime, timedelta
 import os
 import logging
+from time import sleep
+import glob

 import pandas as pd
 import yfinance as yf
@@ -30,18 +32,20 @@ def extract_online_yahoofinance_data(symbol, startdate, enddate):

     oricols = df.columns
     df['TimeStamp'] = df.index
+    # df['TimeStamp'] = df['TimeStamp'].dt.strftime('%Y-%m-%d')
     # df['Date'] = df['TimeStamp'].apply(lambda ts: ts.date())
     df = df[['TimeStamp'] + list(oricols)]

     return df


-def extract_batch_online_yahoofinance_data(symbols, startdate, enddate):
+def extract_batch_online_yahoofinance_data(symbols, startdate, enddate, threads=True):
     combined_df = yf.download(
         ' '.join(symbols),
         start=datetime.strptime(startdate, '%Y-%m-%d'),
         end=datetime.strptime(enddate, '%Y-%m-%d'),
-        group_by='ticker'
+        group_by='ticker',
+        threads=threads
     )

     dataframes = {}
@@ -50,6 +54,7 @@ def extract_batch_online_yahoofinance_data(symbols, startdate, enddate):
             df = combined_df[symbol].copy()
             oricols = df.columns
             df['TimeStamp'] = df.index
+            # df['TimeStamp'] = df['TimeStamp'].dt.strftime('%Y-%m-%d')
             df = df[['TimeStamp'] + list(oricols)]
             dataframes[symbol] = df
         except:
@@ -96,6 +101,10 @@ def get_yahoofinance_data(symbol, startdate, enddate, cacheddir=None):
         preexist = False
         for row in table.where('symbol=="{}"'.format(symbol)):
             preexist = True
+            # print("{} <= {}: {}".format(row['query_startdate'].decode('utf-8'), startdate,
+            #                             row['query_startdate'].decode('utf-8') <= startdate))
+            # print("{} <= {}: {}".format(row['query_enddate'].decode('utf-8'), enddate,
+            #                             row['query_enddate'].decode('utf-8') >= enddate))
             if row['query_startdate'].decode('utf-8') <= startdate and row['query_enddate'].decode('utf-8') >= enddate:
                 df = pd.read_hdf(os.path.join(cacheddir, '{}.h5'.format(symbol)), 'yahoodata')
                 if len(df) > 0:
@@ -141,22 +150,68 @@ def get_yahoofinance_data(symbol, startdate, enddate, cacheddir=None):
         raise TypeError('Type of cacheddir has to be str, but got {} instead!'.format(type(cacheddir)))


-def generating_cached_yahoofinance_data(symbols, startdate, enddate, cacheddir, slicebatch=50):
+def finding_missing_symbols_in_cache(symbols, startdate, enddate, cacheddir):
+    if not os.path.exists(os.path.join(cacheddir, METATABLE_FILENAME)):
+        return symbols
+
+    # symbols recorded in the metatable whose cached range covers the queried range
+    metatable = pd.read_hdf(os.path.join(cacheddir, METATABLE_FILENAME), 'metatable')
+    existing_within_range_symbols = list(
+        metatable['symbol'][
+            (metatable['query_startdate'] <= startdate) & (metatable['query_enddate'] >= enddate)
+        ]
+    )
+    if logging.root.level >= logging.DEBUG:
+        logging.debug('existing within range symbols')
+        for symbol in existing_within_range_symbols:
+            logging.debug('\t{}'.format(symbol))
+
+    # check which symbols have an .h5 file in the cache directory
+    existing_symbols = [
+        os.path.basename(filepath)
+        for filepath in glob.glob(os.path.join(cacheddir, '*.h5'))
+    ]
+    existing_symbols = [filename[:-3] for filename in existing_symbols if filename != METATABLE_FILENAME]
+    if logging.root.level >= logging.DEBUG:
+        logging.debug('existing symbols')
+        for symbol in existing_symbols:
+            logging.debug('\t{}'.format(symbol))
+
+    existing_valid_symbols = set(existing_within_range_symbols) & set(existing_symbols)
+
+    return sorted(list(set(symbols) - set(existing_valid_symbols)))
+
+
+def generating_cached_yahoofinance_data(symbols, startdate, enddate, cacheddir, slicebatch=50, waittime=1, threads=True):
+    tocache_symbols = finding_missing_symbols_in_cache(symbols, startdate, enddate, cacheddir)
+
+    logging.info('Total number of symbols: {}'.format(len(symbols)))
+    logging.info('Total number of symbols needed to cache: {}'.format(len(tocache_symbols)))
     if not os.path.exists(cacheddir) or (os.path.exists(cacheddir) and not os.path.isdir(cacheddir)):
         logging.info('Creating directory: {}'.format(cacheddir))
         os.makedirs(cacheddir)
-    # start as a new file
-    logging.info('Creating file: {}'.format(os.path.join(cacheddir, METATABLE_FILENAME)))
-    metatable_h5file = tb.open_file(os.path.join(cacheddir, METATABLE_FILENAME), 'w')
-    table = metatable_h5file.create_table('/', 'metatable', METATABLE_ROWDES, title='metatable')
+    if not os.path.exists(os.path.join(cacheddir, METATABLE_FILENAME)):
+        logging.info('Creating file: {}'.format(os.path.join(cacheddir, METATABLE_FILENAME)))
+        metatable_h5file = tb.open_file(os.path.join(cacheddir, METATABLE_FILENAME), 'w')
+        table = metatable_h5file.create_table('/', 'metatable', METATABLE_ROWDES, title='metatable')
+    else:
+        metatable_h5file = tb.open_file(os.path.join(cacheddir, METATABLE_FILENAME), 'r+')
+        table = metatable_h5file.root.metatable

-    nbsymbols = len(symbols)
+    nbsymbols = len(tocache_symbols)
     for startidx in tqdm(range(0, nbsymbols, slicebatch)):
-        dataframes = extract_batch_online_yahoofinance_data(
-            symbols[startidx:min(startidx+slicebatch, nbsymbols)],
-            startdate,
-            enddate
-        )
+        success = False
+        while not success:
+            try:
+                dataframes = extract_batch_online_yahoofinance_data(
+                    tocache_symbols[startidx:min(startidx+slicebatch, nbsymbols)],
+                    startdate,
+                    enddate,
+                    threads=threads
+                )
+                success = True
+            except:
+                sleep(waittime)

         for symbol in dataframes:
             df = dataframes[symbol]
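A minimal usage sketch of the caching path as changed in this commit; the ticker list, date range, and cache directory below are placeholder values, and the two functions are assumed to be importable from this module:

# Hypothetical driver script; tickers, dates, and the cache path are placeholders.
symbols = ['AAPL', 'MSFT', 'VOO']
startdate, enddate = '2020-01-01', '2020-12-31'
cacheddir = '/tmp/yahoocache'

# Only symbols not already covered by the cached date range are downloaded,
# in batches of `slicebatch`; a failed batch download is retried after `waittime` seconds.
generating_cached_yahoofinance_data(
    symbols, startdate, enddate, cacheddir,
    slicebatch=50, waittime=1, threads=True
)

# After caching, no symbols should be reported missing for the same range.
print(finding_missing_symbols_in_cache(symbols, startdate, enddate, cacheddir))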