27
27
from nemo .core import Dataset
28
28
from nemo .utils import AppState , logging
29
29
30
+ try :
31
+ import multistorageclient
32
+ MULTISTORAGECLIENT_AVAILABLE = True
33
+ except (ImportError , ModuleNotFoundError ):
34
+ MULTISTORAGECLIENT_AVAILABLE = False
35
+
30
36
__all__ = ["TextMemMapDataset" , "CSVMemMapDataset" , "build_index_files" ]
31
37
__idx_version__ = "0.2" # index file version
32
38
__idx_suffix__ = "idx" # index file suffix
@@ -40,7 +46,10 @@ def _build_index_from_memdata(fn, newline_int):
40
46
Returns a 1D array of ints.
41
47
"""
42
48
# use memmap to read file
43
- mdata = np .memmap (fn , dtype = np .uint8 , mode = "r" )
49
+ if MULTISTORAGECLIENT_AVAILABLE :
50
+ mdata = multistorageclient .numpy .memmap (fn , dtype = np .uint8 , mode = "r" )
51
+ else :
52
+ mdata = np .memmap (fn , dtype = np .uint8 , mode = "r" )
44
53
# find newline positions
45
54
midx = np .where (mdata == newline_int )[0 ]
46
55
midx_dtype = midx .dtype
@@ -250,18 +259,28 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None):
250
259
idx_fn = _index_fn (fn , index_mapping_dir )
251
260
252
261
# create data map
253
- mdata = np .memmap (fn , dtype = np .uint8 , mode = "r" )
262
+ if MULTISTORAGECLIENT_AVAILABLE :
263
+ mdata = multistorageclient .numpy .memmap (fn , dtype = np .uint8 , mode = "r" )
264
+ else :
265
+ mdata = np .memmap (fn , dtype = np .uint8 , mode = "r" )
254
266
255
267
if _index_file_exists (idx_fn ):
256
268
# load index file into memory map
257
- midx = np .load (idx_fn + ".npy" , allow_pickle = True , mmap_mode = "r" )
269
+ if MULTISTORAGECLIENT_AVAILABLE :
270
+ midx = multistorageclient .numpy .load (idx_fn + ".npy" , allow_pickle = True , mmap_mode = "r" )
271
+ else :
272
+ midx = np .load (idx_fn + ".npy" , allow_pickle = True , mmap_mode = "r" )
258
273
# test for header
259
274
if len (midx ) < self ._header_lines :
260
275
raise RuntimeError (f"Missing header, expected { self ._header_lines } header lines" )
261
276
262
277
# load meta info
263
- with open (idx_fn + ".info" , "rb" ) as fp :
264
- idx_info_dict = pickle .load (fp )
278
+ if MULTISTORAGECLIENT_AVAILABLE :
279
+ with multistorageclient .open (idx_fn + ".info" , "rb" ) as fp :
280
+ idx_info_dict = multistorageclient .pickle .load (fp )
281
+ else :
282
+ with open (idx_fn + ".info" , "rb" ) as fp :
283
+ idx_info_dict = pickle .load (fp )
265
284
# test for mismatch in expected newline_int
266
285
if "newline_int" in idx_info_dict :
267
286
newline_int = idx_info_dict ["newline_int" ]
@@ -438,10 +457,12 @@ def _build_data_from_text(self, text):
438
457
439
458
def _index_file_exists(idx_fn):
    """Tell whether both files that make up an index are present.

    An index is stored as two files sharing the ``idx_fn`` prefix:
    ``<idx_fn>.npy`` and ``<idx_fn>.info``.

    Args:
        idx_fn: index path prefix, without the ``.npy`` / ``.info`` suffix.

    Returns:
        Truthy only when both the ``.npy`` and the ``.info`` file exist.
    """
    # Use multistorageclient's os.path wrapper when that package was
    # imported successfully at module load; otherwise use the stdlib one.
    path_exists = multistorageclient.os.path.exists if MULTISTORAGECLIENT_AVAILABLE else os.path.exists

    # `and` short-circuits, so the .info file is only probed when .npy exists.
    return path_exists(idx_fn + ".npy") and path_exists(idx_fn + ".info")
445
466
446
467
447
468
def _index_fn (fn : str , index_mapping_dir : str ) -> str :
@@ -504,9 +525,16 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir
504
525
505
526
# save index as numpy array to enable memmap reading
506
527
logging .info (f"Saving idx file = { idx_fn } .npy" )
507
- np .save (idx_fn + ".npy" , midx , allow_pickle = True )
528
+ if MULTISTORAGECLIENT_AVAILABLE :
529
+ multistorageclient .numpy .save (idx_fn + ".npy" , midx , allow_pickle = True )
530
+ else :
531
+ np .save (idx_fn + ".npy" , midx , allow_pickle = True )
532
+
508
533
logging .info (f"Saving metadata file = { idx_fn } .info" )
509
- pickle .dump (data , open (idx_fn + ".info" , "wb" ))
534
+ if MULTISTORAGECLIENT_AVAILABLE :
535
+ multistorageclient .pickle .dump (data , idx_fn + ".info" )
536
+ else :
537
+ pickle .dump (data , open (idx_fn + ".info" , "wb" ))
510
538
511
539
return True
512
540
0 commit comments