@@ -13,11 +13,25 @@ use iox_time::TimeProvider;
13
13
use object_store:: path:: { Path , PathPart } ;
14
14
use object_store:: { ObjectStore , PutMode , PutOptions , PutPayload } ;
15
15
use observability_deps:: tracing:: { debug, error, info} ;
16
- use std:: time:: Duration ;
16
+ use std:: time:: { Duration , Instant } ;
17
17
use std:: { str:: FromStr , sync:: Arc } ;
18
18
use tokio:: sync:: Mutex ;
19
19
use tokio:: sync:: { OwnedSemaphorePermit , Semaphore , oneshot} ;
20
20
21
+ #[ derive( Debug ) ]
22
+ pub struct CreateWalObjectStoreArgs < ' a > {
23
+ pub time_provider : Arc < dyn TimeProvider > ,
24
+ pub object_store : Arc < dyn ObjectStore > ,
25
+ pub node_identifier_prefix : & ' a str ,
26
+ pub file_notifier : Arc < dyn WalFileNotifier > ,
27
+ pub config : WalConfig ,
28
+ pub last_wal_sequence_number : Option < WalFileSequenceNumber > ,
29
+ pub last_snapshot_sequence_number : Option < SnapshotSequenceNumber > ,
30
+ pub snapshotted_wal_files_to_keep : u64 ,
31
+ pub shutdown : ShutdownToken ,
32
+ pub wal_replay_concurrency_limit : Option < usize > ,
33
+ }
34
+
21
35
#[ derive( Debug ) ]
22
36
pub struct WalObjectStore {
23
37
object_store : Arc < dyn ObjectStore > ,
@@ -36,18 +50,21 @@ impl WalObjectStore {
36
50
/// Creates a new WAL. This will replay files into the notifier and trigger any snapshots that
37
51
/// exist in the WAL files that haven't been cleaned up yet.
38
52
#[ allow( clippy:: too_many_arguments) ]
39
- pub async fn new (
40
- time_provider : Arc < dyn TimeProvider > ,
41
- object_store : Arc < dyn ObjectStore > ,
42
- node_identifier_prefix : impl Into < String > + Send ,
43
- file_notifier : Arc < dyn WalFileNotifier > ,
44
- config : WalConfig ,
45
- last_wal_sequence_number : Option < WalFileSequenceNumber > ,
46
- last_snapshot_sequence_number : Option < SnapshotSequenceNumber > ,
47
- snapshotted_wal_files_to_keep : u64 ,
48
- shutdown : ShutdownToken ,
53
+ pub async fn new < ' a > (
54
+ CreateWalObjectStoreArgs {
55
+ time_provider,
56
+ object_store,
57
+ node_identifier_prefix,
58
+ file_notifier,
59
+ config,
60
+ last_wal_sequence_number,
61
+ last_snapshot_sequence_number,
62
+ snapshotted_wal_files_to_keep,
63
+ shutdown,
64
+ wal_replay_concurrency_limit,
65
+ } : CreateWalObjectStoreArgs < ' a > ,
49
66
) -> Result < Arc < Self > , crate :: Error > {
50
- let node_identifier = node_identifier_prefix. into ( ) ;
67
+ let node_identifier = node_identifier_prefix. to_string ( ) ;
51
68
let all_wal_file_paths =
52
69
load_all_wal_file_paths ( Arc :: clone ( & object_store) , node_identifier. clone ( ) ) . await ?;
53
70
let flush_interval = config. flush_interval ;
@@ -64,8 +81,12 @@ impl WalObjectStore {
64
81
shutdown. clone_cancellation_token ( ) ,
65
82
) ;
66
83
67
- wal. replay ( last_wal_sequence_number, & all_wal_file_paths)
68
- . await ?;
84
+ wal. replay (
85
+ last_wal_sequence_number,
86
+ & all_wal_file_paths,
87
+ wal_replay_concurrency_limit,
88
+ )
89
+ . await ?;
69
90
let wal = Arc :: new ( wal) ;
70
91
background_wal_flush ( Arc :: clone ( & wal) , flush_interval, shutdown) ;
71
92
@@ -128,8 +149,10 @@ impl WalObjectStore {
128
149
& self ,
129
150
last_wal_sequence_number : Option < WalFileSequenceNumber > ,
130
151
all_wal_file_paths : & [ Path ] ,
152
+ concurrency_limit : Option < usize > ,
131
153
) -> crate :: Result < ( ) > {
132
- debug ! ( "replaying" ) ;
154
+ let replay_start = Instant :: now ( ) ;
155
+ info ! ( "replaying WAL files" ) ;
133
156
let paths = self . load_existing_wal_file_paths ( last_wal_sequence_number, all_wal_file_paths) ;
134
157
135
158
let last_snapshot_sequence_number = {
@@ -148,72 +171,84 @@ impl WalObjectStore {
148
171
Ok ( verify_file_type_and_deserialize ( file_bytes) ?)
149
172
}
150
173
151
- let mut replay_tasks = Vec :: new ( ) ;
152
- for path in paths {
153
- let object_store = Arc :: clone ( & self . object_store ) ;
154
- replay_tasks. push ( tokio:: spawn ( get_contents ( object_store, path) ) ) ;
155
- }
156
-
157
- for wal_contents in replay_tasks {
158
- let wal_contents = wal_contents. await ??;
174
+ // Load N files concurrently and then replay them immediately before loading the next batch
175
+ // of N files. Since replaying has to happen _in order_ only loading the files part is
176
+ // concurrent, replaying the WAL file itself is done sequentially based on the original
177
+ // order (i.e paths, which is already sorted)
178
+ for batched in paths. chunks ( concurrency_limit. unwrap_or ( usize:: MAX ) ) {
179
+ let batched_start = Instant :: now ( ) ;
180
+ let mut results = Vec :: with_capacity ( batched. len ( ) ) ;
181
+ for path in batched {
182
+ let object_store = Arc :: clone ( & self . object_store ) ;
183
+ results. push ( tokio:: spawn ( get_contents ( object_store, path. clone ( ) ) ) ) ;
184
+ }
159
185
160
- // add this to the snapshot tracker, so we know what to clear out later if the replay
161
- // was a wal file that had a snapshot
162
- self . flush_buffer
163
- . lock ( )
164
- . await
165
- . replay_wal_period ( WalPeriod :: new (
166
- wal_contents. wal_file_number ,
167
- Timestamp :: new ( wal_contents. min_timestamp_ns ) ,
168
- Timestamp :: new ( wal_contents. max_timestamp_ns ) ,
169
- ) ) ;
170
-
171
- info ! (
172
- n_ops = %wal_contents. ops. len( ) ,
173
- min_timestamp_ns = %wal_contents. min_timestamp_ns,
174
- max_timestamp_ns = %wal_contents. max_timestamp_ns,
175
- wal_file_number = %wal_contents. wal_file_number,
176
- snapshot_details = ?wal_contents. snapshot,
177
- "replaying WAL file"
178
- ) ;
186
+ for wal_contents in results {
187
+ let wal_contents = wal_contents. await ??;
188
+ info ! (
189
+ n_ops = %wal_contents. ops. len( ) ,
190
+ min_timestamp_ns = %wal_contents. min_timestamp_ns,
191
+ max_timestamp_ns = %wal_contents. max_timestamp_ns,
192
+ wal_file_number = %wal_contents. wal_file_number,
193
+ snapshot_details = ?wal_contents. snapshot,
194
+ "replaying WAL file with details"
195
+ ) ;
179
196
180
- match wal_contents. snapshot {
181
- // This branch uses so much time
182
- None => self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ,
183
- Some ( snapshot_details) => {
184
- let snapshot_info = {
185
- let mut buffer = self . flush_buffer . lock ( ) . await ;
186
-
187
- match buffer. snapshot_tracker . snapshot ( snapshot_details. forced ) {
188
- None => None ,
189
- Some ( info) => {
190
- let semaphore = Arc :: clone ( & buffer. snapshot_semaphore ) ;
191
- let permit = semaphore. acquire_owned ( ) . await . unwrap ( ) ;
192
- Some ( ( info, permit) )
197
+ // add this to the snapshot tracker, so we know what to clear out later if the replay
198
+ // was a wal file that had a snapshot
199
+ self . flush_buffer
200
+ . lock ( )
201
+ . await
202
+ . replay_wal_period ( WalPeriod :: new (
203
+ wal_contents. wal_file_number ,
204
+ Timestamp :: new ( wal_contents. min_timestamp_ns ) ,
205
+ Timestamp :: new ( wal_contents. max_timestamp_ns ) ,
206
+ ) ) ;
207
+
208
+ match wal_contents. snapshot {
209
+ // This branch uses so much time
210
+ None => self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ,
211
+ Some ( snapshot_details) => {
212
+ let snapshot_info = {
213
+ let mut buffer = self . flush_buffer . lock ( ) . await ;
214
+
215
+ match buffer. snapshot_tracker . snapshot ( snapshot_details. forced ) {
216
+ None => None ,
217
+ Some ( info) => {
218
+ let semaphore = Arc :: clone ( & buffer. snapshot_semaphore ) ;
219
+ let permit = semaphore. acquire_owned ( ) . await . unwrap ( ) ;
220
+ Some ( ( info, permit) )
221
+ }
193
222
}
223
+ } ;
224
+ if snapshot_details. snapshot_sequence_number
225
+ <= last_snapshot_sequence_number
226
+ {
227
+ // Instead just notify about the WAL, as this snapshot has already been taken
228
+ // and WAL files may have been cleared.
229
+ self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ;
230
+ } else {
231
+ let snapshot_done = self
232
+ . file_notifier
233
+ . notify_and_snapshot ( Arc :: new ( wal_contents) , snapshot_details)
234
+ . await ;
235
+ let details = snapshot_done. await . unwrap ( ) ;
236
+ assert_eq ! ( snapshot_details, details) ;
194
237
}
195
- } ;
196
- if snapshot_details. snapshot_sequence_number <= last_snapshot_sequence_number {
197
- // Instead just notify about the WAL, as this snapshot has already been taken
198
- // and WAL files may have been cleared.
199
- self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ;
200
- } else {
201
- let snapshot_done = self
202
- . file_notifier
203
- . notify_and_snapshot ( Arc :: new ( wal_contents) , snapshot_details)
204
- . await ;
205
- let details = snapshot_done. await . unwrap ( ) ;
206
- assert_eq ! ( snapshot_details, details) ;
207
- }
208
238
209
- // if the info is there, we have wal files to delete
210
- if let Some ( ( snapshot_info, snapshot_permit) ) = snapshot_info {
211
- self . cleanup_snapshot ( snapshot_info, snapshot_permit) . await ;
239
+ // if the info is there, we have wal files to delete
240
+ if let Some ( ( snapshot_info, snapshot_permit) ) = snapshot_info {
241
+ self . cleanup_snapshot ( snapshot_info, snapshot_permit) . await ;
242
+ }
212
243
}
213
244
}
214
245
}
246
+ let batched_end = batched_start. elapsed ( ) ;
247
+ debug ! ( time_taken = ?batched_end, batch_len = ?batched. len( ) , "replaying batch completed" ) ;
215
248
}
216
249
250
+ // this is useful to know at the info level
251
+ info ! ( time_taken = ?replay_start. elapsed( ) , "completed replaying wal files" ) ;
217
252
Ok ( ( ) )
218
253
}
219
254
@@ -1216,6 +1251,7 @@ mod tests {
1216
1251
Path :: from ( "my_host/wal/00000000001.wal" ) ,
1217
1252
Path :: from ( "my_host/wal/00000000002.wal" ) ,
1218
1253
] ,
1254
+ None ,
1219
1255
)
1220
1256
. await
1221
1257
. unwrap ( ) ;
@@ -1364,7 +1400,7 @@ mod tests {
1364
1400
vec![ Path :: from( "my_host/wal/00000000003.wal" ) ]
1365
1401
) ;
1366
1402
replay_wal
1367
- . replay ( None , & [ Path :: from ( "my_host/wal/00000000003.wal" ) ] )
1403
+ . replay ( None , & [ Path :: from ( "my_host/wal/00000000003.wal" ) ] , None )
1368
1404
. await
1369
1405
. unwrap ( ) ;
1370
1406
let replay_notifier = replay_notifier
0 commit comments