@@ -13,51 +13,42 @@ use std::path::Path;
 use std::sync::Arc;
 use tokio::sync::{Mutex, Semaphore};
 use tokio::task;
+use sysinfo::{System, SystemExt};

 /// How many records to read from the archives before attempting insert
-static LOAD_QUEUE_SIZE: usize = 1000;
+// static LOAD_QUEUE_SIZE: usize = 1000;
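+// NOTE: the cross-file batching queue that used this constant was removed;
+// records are now inserted per archive file, chunked only by QUERY_BATCH_SIZE.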

 /// When we attempt insert, the chunks of txs that go into each query
 static QUERY_BATCH_SIZE: usize = 250;

 /// From a tgz file, decompress all the .json files in the archive
 /// and then read them into the warehouse record format
-pub async fn decompress_and_extract(tgz_file: &Path, pool: &Graph) -> Result<u64> {
+pub async fn single_thread_decompress_extract(tgz_file: &Path, pool: &Graph) -> Result<u64> {
     let temppath = decompress_to_temppath(tgz_file)?;
     let json_vec = list_all_json_files(temppath.path())?;

     let mut found_count = 0u64;
     let mut created_count = 0u64;

     let mut unique_functions: Vec<String> = vec![];
-    // fill to BATCH_SIZE before attempting insert.
-    // many files may only have a handful of user txs,
-    // so individual files may have far fewer than BATCH_SIZE.
-    let mut queue: Vec<WarehouseTxMaster> = vec![];

     for j in json_vec {
-        if let Ok((mut r, _e, _)) = extract_v5_json_rescue(&j) {
-            queue.append(&mut r);
-        }
+        let (records, _, unique) = extract_v5_json_rescue(&j)?;
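+        // NOTE: with `?` a malformed JSON file now aborts the whole archive,
+        // where the old `if let Ok(...)` silently skipped it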

-        queue.iter().for_each(|s| {
-            if !unique_functions.contains(&s.function) {
-                unique_functions.push(s.function.clone());
+        unique.iter().for_each(|f| {
+            if !unique_functions.contains(&f) {
+                unique_functions.push(f.clone());
             }
         });

-        if queue.len() >= LOAD_QUEUE_SIZE {
-            let drain: Vec<WarehouseTxMaster> = std::mem::take(&mut queue);
-
-            let res = tx_batch(
-                &drain,
-                pool,
-                QUERY_BATCH_SIZE,
-                j.file_name().unwrap().to_str().unwrap(),
-            )
-            .await?;
-            created_count += res.created_tx as u64;
-            found_count += drain.len() as u64;
-        }
+        let res = tx_batch(
+            &records,
+            pool,
+            QUERY_BATCH_SIZE,
+            j.file_name().unwrap().to_str().unwrap(),
+        )
+        .await?;
+        created_count += res.created_tx as u64;
+        found_count += records.len() as u64;
     }

     info!("V5 transactions found: {}", found_count);
@@ -317,3 +308,78 @@ pub async fn rip(start_dir: &Path, pool: &Graph) -> Result<u64> {
     }
     Ok(txs)
 }
+
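+/// Decompress and extract every tgz archive under `start_dir`, spawning one
+/// tokio task per archive with no cap on how many run at once.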
+pub async fn rip_concurrent(start_dir: &Path, pool: &Graph) -> Result<()> {
+    let tgz_list = list_all_tgz_archives(start_dir)?;
+    info!("tgz archives found: {}", tgz_list.len());
+
+    let tasks: Vec<_> = tgz_list
+        .into_iter()
+        .map(|p| {
+            let pool = pool.clone(); // clone the pool handle for each task
+            tokio::spawn(async move { single_thread_decompress_extract(&p, &pool).await })
+        })
+        .collect();
+
+    // Await all tasks and log the outcome of each
+    let results = futures::future::join_all(tasks).await;
+    for (i, result) in results.into_iter().enumerate() {
+        match result {
+            Ok(Ok(_)) => info!("Task {} completed successfully.", i),
+            Ok(Err(e)) => error!("Task {} failed: {:?}", i, e),
+            Err(e) => error!("Task {} panicked: {:?}", i, e),
+        }
+    }
+    Ok(())
+}
+
+/// Limit on the number of archives processed concurrently
+const MAX_CONCURRENT_TASKS: usize = 4;
+
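+/// Like `rip_concurrent`, but uses a semaphore so that at most
+/// MAX_CONCURRENT_TASKS archives are being processed at any one time.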
+pub async fn rip_concurrent_limited(start_dir: &Path, pool: &Graph) -> Result<()> {
+    let tgz_list = list_all_tgz_archives(start_dir)?;
+    info!("tgz archives found: {}", tgz_list.len());
+
+    // semaphore caps how many archives are in flight at once
+    let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_TASKS));
+    let mut tasks = vec![];
+
+    for p in tgz_list.into_iter() {
+        let pool = pool.clone(); // clone the pool handle for each task
+        let semaphore = Arc::clone(&semaphore); // clone the semaphore handle for each task
+
+        let task = tokio::spawn(async move {
+            let _permit = semaphore.acquire().await; // acquire a permit before doing the work
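+            // the Result (and the permit inside its Ok) is held until `_permit`
+            // drops at the end of this task, releasing the concurrency slot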
+            single_thread_decompress_extract(&p, &pool).await
+        });
+
+        tasks.push(task);
+    }
+
+    // Await all tasks and log the outcome of each
+    let results = futures::future::join_all(tasks).await;
+
+    for (i, result) in results.into_iter().enumerate() {
+        match result {
+            Ok(Ok(_)) => info!("Task {} completed successfully.", i),
+            Ok(Err(e)) => error!("Task {} failed: {:?}", i, e),
+            Err(e) => error!("Task {} panicked: {:?}", i, e),
+        }
+    }
+
+    Ok(())
+}