-
Notifications
You must be signed in to change notification settings - Fork 0
/
benckmark_streamload.py
126 lines (104 loc) · 5.33 KB
/
benckmark_streamload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
This script benchmarks the streaming load and full load of a parquet dataset.
"""
import torch
import pyarrow.dataset as ds
from torch.utils.data import TensorDataset, DataLoader
from parquet_loader import ParquetDataset, ParquetDataLoader
from pyinstrument.profiler import Profiler as time_profiler
from memory_profiler import profile as mem_profile
## config
path = '../synthetic_data'
num_workers = 4
batch_size = 66
# streaming load
@mem_profile
def streaming_load():
prof = time_profiler()
prof.start()
dataset = ParquetDataset(path)
dataloader = ParquetDataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
for i, batch in enumerate(dataloader):
# print(i, batch.shape)
pass
prof.stop()
print(f'streaming load: {prof.output_text(unicode=True, color=True)}')
## full load
@mem_profile
def full_load():
prof = time_profiler()
prof.start()
data = torch.from_numpy(ds.dataset(path).to_table().to_pandas().to_numpy())
dataset = TensorDataset(data)
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
for i, batch in enumerate(dataloader):
# print(i, batch.shape)
pass
prof.stop()
print(f'full load: {prof.output_text(unicode=True, color=True)}')
if __name__ == '__main__':
streaming_load()
full_load()
#######################
### Result on my PC ###
#######################
"""
streaming load:
_ ._ __/__ _ _ _ _ _/_ Recorded: 22:54:37 Samples: 3309
/_//_/// /_\ / //_// / //_'/ // Duration: 7.290 CPU time: 2.587
/ _/ v4.6.2
Program: /home/hanhui/codes/ParquetLoader/benckmark.py
7.290 streaming_load benckmark.py:17
├─ 6.797 _MultiProcessingDataLoaderIter.__next__ torch/utils/data/dataloader.py:625
│ [2 frames hidden] torch
│ 6.625 _MultiProcessingDataLoaderIter._next_data parquet_loader/dataloader.py:34
│ ├─ 6.507 _MultiProcessingDataLoaderIter._next_data torch/utils/data/dataloader.py:1298
│ │ [31 frames hidden] torch, multiprocessing, <built-in>, s...
│ │ 4.395 read <built-in>
│ └─ 0.110 squeeze_first_dim parquet_loader/dataloader.py:18
│ └─ 0.103 _VariableFunctionsClass.squeeze <built-in>
├─ 0.365 [self] benckmark.py
└─ 0.095 ParquetDataset.__init__ parquet_loader/dataset.py:22
└─ 0.075 <listcomp> parquet_loader/dataset.py:28
Filename: /home/hanhui/codes/ParquetLoader/benckmark.py
Line # Mem usage Increment Occurrences Line Contents
=============================================================
17 480.8 MiB 480.8 MiB 1 @mem_profile
18 def streaming_load():
19 480.8 MiB 0.0 MiB 1 prof = time_profiler()
20 480.8 MiB 0.0 MiB 1 prof.start()
21 561.0 MiB 80.2 MiB 1 dataset = ParquetDataset(path)
22 561.0 MiB 0.0 MiB 1 dataloader = ParquetDataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
23 553.4 MiB -146.1 MiB 1517 for i, batch in enumerate(dataloader):
24 # print(i, batch.shape)
25 553.3 MiB -138.5 MiB 1516 pass
26 553.4 MiB 0.0 MiB 1 prof.stop()
27 573.5 MiB 20.0 MiB 1 print(f'streaming load: {prof.output_text(unicode=True, color=True)}')
full load:
_ ._ __/__ _ _ _ _ _/_ Recorded: 22:54:47 Samples: 2048
/_//_/// /_\ / //_// / //_'/ // Duration: 3.042 CPU time: 6.524
/ _/ v4.6.2
Program: /home/hanhui/codes/ParquetLoader/benckmark.py
3.041 full_load benckmark.py:30
├─ 1.670 _MultiProcessingDataLoaderIter.__next__ torch/utils/data/dataloader.py:625
│ [52 frames hidden] torch, multiprocessing, <built-in>, s...
├─ 1.277 [self] benckmark.py
├─ 0.055 DataLoader.__iter__ torch/utils/data/dataloader.py:425
│ [3 frames hidden] torch
└─ 0.032 table_to_dataframe pyarrow/pandas_compat.py:760
Filename: /home/hanhui/codes/ParquetLoader/benckmark.py
Line # Mem usage Increment Occurrences Line Contents
=============================================================
30 573.5 MiB 573.5 MiB 1 @mem_profile
31 def full_load():
32 573.5 MiB 0.0 MiB 1 prof = time_profiler()
33 573.5 MiB 0.0 MiB 1 prof.start()
34 1180.9 MiB 607.4 MiB 1 data = torch.from_numpy(ds.dataset(path).to_table().to_pandas().to_numpy())
35 1180.9 MiB 0.0 MiB 1 dataset = TensorDataset(data)
36 1180.9 MiB 0.0 MiB 1 dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
37 1183.3 MiB -137055.7 MiB 1517 for i, batch in enumerate(dataloader):
38 # print(i, batch.shape)
39 1183.3 MiB -136873.6 MiB 1516 pass
40 988.7 MiB -194.6 MiB 1 prof.stop()
41 988.8 MiB 0.1 MiB 1 print(f'full load: {prof.output_text(unicode=True, color=True)}')
"""