@@ -13,11 +13,25 @@ use iox_time::TimeProvider;
13
13
use object_store:: path:: { Path , PathPart } ;
14
14
use object_store:: { ObjectStore , PutMode , PutOptions , PutPayload } ;
15
15
use observability_deps:: tracing:: { debug, error, info} ;
16
- use std:: time:: Duration ;
16
+ use std:: time:: { Duration , Instant } ;
17
17
use std:: { str:: FromStr , sync:: Arc } ;
18
18
use tokio:: sync:: Mutex ;
19
19
use tokio:: sync:: { OwnedSemaphorePermit , Semaphore , oneshot} ;
20
20
21
+ #[ derive( Debug ) ] // Argument bundle for WalObjectStore::new, which destructures every field in its parameter list.
22
+ pub struct CreateWalObjectStoreArgs < ' a > {
23
+ pub time_provider : Arc < dyn TimeProvider > ,
24
+ pub object_store : Arc < dyn ObjectStore > , // new() passes a clone to load_all_wal_file_paths to list existing WAL files
25
+ pub node_identifier_prefix : & ' a str , // borrowed; new() copies it into an owned String with to_string()
26
+ pub file_notifier : Arc < dyn WalFileNotifier > , // presumably the notifier replay() calls notify()/notify_and_snapshot() on - TODO confirm in new()
27
+ pub config : WalConfig , // new() reads config.flush_interval and hands it to background_wal_flush
28
+ pub last_wal_sequence_number : Option < WalFileSequenceNumber > , // forwarded unchanged to replay()
29
+ pub last_snapshot_sequence_number : Option < SnapshotSequenceNumber > ,
30
+ pub snapshotted_wal_files_to_keep : u64 ,
31
+ pub shutdown : ShutdownToken , // new() calls clone_cancellation_token() on it, then moves it into background_wal_flush
32
+ pub wal_replay_concurrency_limit : Option < usize > , // forwarded to replay() as the chunk size for concurrent WAL file loads; None becomes usize::MAX (single batch). NOTE(review): Some(0) would reach slice::chunks(0), which panics - validate upstream
33
+ }
34
+
21
35
#[ derive( Debug ) ]
22
36
pub struct WalObjectStore {
23
37
object_store : Arc < dyn ObjectStore > ,
@@ -35,19 +49,21 @@ pub struct WalObjectStore {
35
49
impl WalObjectStore {
36
50
/// Creates a new WAL. This will replay files into the notifier and trigger any snapshots that
37
51
/// exist in the WAL files that haven't been cleaned up yet.
38
- #[ allow( clippy:: too_many_arguments) ]
39
- pub async fn new (
40
- time_provider : Arc < dyn TimeProvider > ,
41
- object_store : Arc < dyn ObjectStore > ,
42
- node_identifier_prefix : impl Into < String > + Send ,
43
- file_notifier : Arc < dyn WalFileNotifier > ,
44
- config : WalConfig ,
45
- last_wal_sequence_number : Option < WalFileSequenceNumber > ,
46
- last_snapshot_sequence_number : Option < SnapshotSequenceNumber > ,
47
- snapshotted_wal_files_to_keep : u64 ,
48
- shutdown : ShutdownToken ,
52
+ pub async fn new < ' a > (
53
+ CreateWalObjectStoreArgs {
54
+ time_provider,
55
+ object_store,
56
+ node_identifier_prefix,
57
+ file_notifier,
58
+ config,
59
+ last_wal_sequence_number,
60
+ last_snapshot_sequence_number,
61
+ snapshotted_wal_files_to_keep,
62
+ shutdown,
63
+ wal_replay_concurrency_limit,
64
+ } : CreateWalObjectStoreArgs < ' a > ,
49
65
) -> Result < Arc < Self > , crate :: Error > {
50
- let node_identifier = node_identifier_prefix. into ( ) ;
66
+ let node_identifier = node_identifier_prefix. to_string ( ) ;
51
67
let all_wal_file_paths =
52
68
load_all_wal_file_paths ( Arc :: clone ( & object_store) , node_identifier. clone ( ) ) . await ?;
53
69
let flush_interval = config. flush_interval ;
@@ -64,8 +80,12 @@ impl WalObjectStore {
64
80
shutdown. clone_cancellation_token ( ) ,
65
81
) ;
66
82
67
- wal. replay ( last_wal_sequence_number, & all_wal_file_paths)
68
- . await ?;
83
+ wal. replay (
84
+ last_wal_sequence_number,
85
+ & all_wal_file_paths,
86
+ wal_replay_concurrency_limit,
87
+ )
88
+ . await ?;
69
89
let wal = Arc :: new ( wal) ;
70
90
background_wal_flush ( Arc :: clone ( & wal) , flush_interval, shutdown) ;
71
91
@@ -128,8 +148,10 @@ impl WalObjectStore {
128
148
& self ,
129
149
last_wal_sequence_number : Option < WalFileSequenceNumber > ,
130
150
all_wal_file_paths : & [ Path ] ,
151
+ concurrency_limit : Option < usize > ,
131
152
) -> crate :: Result < ( ) > {
132
- debug ! ( "replaying" ) ;
153
+ let replay_start = Instant :: now ( ) ;
154
+ info ! ( "replaying WAL files" ) ;
133
155
let paths = self . load_existing_wal_file_paths ( last_wal_sequence_number, all_wal_file_paths) ;
134
156
135
157
let last_snapshot_sequence_number = {
@@ -148,72 +170,84 @@ impl WalObjectStore {
148
170
Ok ( verify_file_type_and_deserialize ( file_bytes) ?)
149
171
}
150
172
151
- let mut replay_tasks = Vec :: new ( ) ;
152
- for path in paths {
153
- let object_store = Arc :: clone ( & self . object_store ) ;
154
- replay_tasks. push ( tokio:: spawn ( get_contents ( object_store, path) ) ) ;
155
- }
156
-
157
- for wal_contents in replay_tasks {
158
- let wal_contents = wal_contents. await ??;
173
+ // Load N files concurrently and then replay them immediately before loading the next batch
174
+ // of N files. Since replaying has to happen _in order_ only loading the files part is
175
+ // concurrent, replaying the WAL file itself is done sequentially based on the original
176
+ // order (i.e. paths, which is already sorted)
177
+ for batched in paths. chunks ( concurrency_limit. unwrap_or ( usize:: MAX ) ) {
178
+ let batched_start = Instant :: now ( ) ;
179
+ let mut results = Vec :: with_capacity ( batched. len ( ) ) ;
180
+ for path in batched {
181
+ let object_store = Arc :: clone ( & self . object_store ) ;
182
+ results. push ( tokio:: spawn ( get_contents ( object_store, path. clone ( ) ) ) ) ;
183
+ }
159
184
160
- // add this to the snapshot tracker, so we know what to clear out later if the replay
161
- // was a wal file that had a snapshot
162
- self . flush_buffer
163
- . lock ( )
164
- . await
165
- . replay_wal_period ( WalPeriod :: new (
166
- wal_contents. wal_file_number ,
167
- Timestamp :: new ( wal_contents. min_timestamp_ns ) ,
168
- Timestamp :: new ( wal_contents. max_timestamp_ns ) ,
169
- ) ) ;
170
-
171
- info ! (
172
- n_ops = %wal_contents. ops. len( ) ,
173
- min_timestamp_ns = %wal_contents. min_timestamp_ns,
174
- max_timestamp_ns = %wal_contents. max_timestamp_ns,
175
- wal_file_number = %wal_contents. wal_file_number,
176
- snapshot_details = ?wal_contents. snapshot,
177
- "replaying WAL file"
178
- ) ;
185
+ for wal_contents in results {
186
+ let wal_contents = wal_contents. await ??;
187
+ info ! (
188
+ n_ops = %wal_contents. ops. len( ) ,
189
+ min_timestamp_ns = %wal_contents. min_timestamp_ns,
190
+ max_timestamp_ns = %wal_contents. max_timestamp_ns,
191
+ wal_file_number = %wal_contents. wal_file_number,
192
+ snapshot_details = ?wal_contents. snapshot,
193
+ "replaying WAL file with details"
194
+ ) ;
179
195
180
- match wal_contents. snapshot {
181
- // This branch uses so much time
182
- None => self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ,
183
- Some ( snapshot_details) => {
184
- let snapshot_info = {
185
- let mut buffer = self . flush_buffer . lock ( ) . await ;
186
-
187
- match buffer. snapshot_tracker . snapshot ( snapshot_details. forced ) {
188
- None => None ,
189
- Some ( info) => {
190
- let semaphore = Arc :: clone ( & buffer. snapshot_semaphore ) ;
191
- let permit = semaphore. acquire_owned ( ) . await . unwrap ( ) ;
192
- Some ( ( info, permit) )
196
+ // add this to the snapshot tracker, so we know what to clear out later if the replay
197
+ // was a wal file that had a snapshot
198
+ self . flush_buffer
199
+ . lock ( )
200
+ . await
201
+ . replay_wal_period ( WalPeriod :: new (
202
+ wal_contents. wal_file_number ,
203
+ Timestamp :: new ( wal_contents. min_timestamp_ns ) ,
204
+ Timestamp :: new ( wal_contents. max_timestamp_ns ) ,
205
+ ) ) ;
206
+
207
+ match wal_contents. snapshot {
208
+ // This branch uses so much time
209
+ None => self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ,
210
+ Some ( snapshot_details) => {
211
+ let snapshot_info = {
212
+ let mut buffer = self . flush_buffer . lock ( ) . await ;
213
+
214
+ match buffer. snapshot_tracker . snapshot ( snapshot_details. forced ) {
215
+ None => None ,
216
+ Some ( info) => {
217
+ let semaphore = Arc :: clone ( & buffer. snapshot_semaphore ) ;
218
+ let permit = semaphore. acquire_owned ( ) . await . unwrap ( ) ;
219
+ Some ( ( info, permit) )
220
+ }
193
221
}
222
+ } ;
223
+ if snapshot_details. snapshot_sequence_number
224
+ <= last_snapshot_sequence_number
225
+ {
226
+ // Instead just notify about the WAL, as this snapshot has already been taken
227
+ // and WAL files may have been cleared.
228
+ self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ;
229
+ } else {
230
+ let snapshot_done = self
231
+ . file_notifier
232
+ . notify_and_snapshot ( Arc :: new ( wal_contents) , snapshot_details)
233
+ . await ;
234
+ let details = snapshot_done. await . unwrap ( ) ;
235
+ assert_eq ! ( snapshot_details, details) ;
194
236
}
195
- } ;
196
- if snapshot_details. snapshot_sequence_number <= last_snapshot_sequence_number {
197
- // Instead just notify about the WAL, as this snapshot has already been taken
198
- // and WAL files may have been cleared.
199
- self . file_notifier . notify ( Arc :: new ( wal_contents) ) . await ;
200
- } else {
201
- let snapshot_done = self
202
- . file_notifier
203
- . notify_and_snapshot ( Arc :: new ( wal_contents) , snapshot_details)
204
- . await ;
205
- let details = snapshot_done. await . unwrap ( ) ;
206
- assert_eq ! ( snapshot_details, details) ;
207
- }
208
237
209
- // if the info is there, we have wal files to delete
210
- if let Some ( ( snapshot_info, snapshot_permit) ) = snapshot_info {
211
- self . cleanup_snapshot ( snapshot_info, snapshot_permit) . await ;
238
+ // if the info is there, we have wal files to delete
239
+ if let Some ( ( snapshot_info, snapshot_permit) ) = snapshot_info {
240
+ self . cleanup_snapshot ( snapshot_info, snapshot_permit) . await ;
241
+ }
212
242
}
213
243
}
214
244
}
245
+ let batched_end = batched_start. elapsed ( ) ;
246
+ debug ! ( time_taken = ?batched_end, batch_len = ?batched. len( ) , "replaying batch completed" ) ;
215
247
}
216
248
249
+ // this is useful to know at the info level
250
+ info ! ( time_taken = ?replay_start. elapsed( ) , "completed replaying wal files" ) ;
217
251
Ok ( ( ) )
218
252
}
219
253
@@ -1216,6 +1250,7 @@ mod tests {
1216
1250
Path :: from ( "my_host/wal/00000000001.wal" ) ,
1217
1251
Path :: from ( "my_host/wal/00000000002.wal" ) ,
1218
1252
] ,
1253
+ None ,
1219
1254
)
1220
1255
. await
1221
1256
. unwrap ( ) ;
@@ -1364,7 +1399,7 @@ mod tests {
1364
1399
vec![ Path :: from( "my_host/wal/00000000003.wal" ) ]
1365
1400
) ;
1366
1401
replay_wal
1367
- . replay ( None , & [ Path :: from ( "my_host/wal/00000000003.wal" ) ] )
1402
+ . replay ( None , & [ Path :: from ( "my_host/wal/00000000003.wal" ) ] , None )
1368
1403
. await
1369
1404
. unwrap ( ) ;
1370
1405
let replay_notifier = replay_notifier
0 commit comments