@@ -8,6 +8,7 @@ use clap::Parser;
88use rayon:: prelude:: * ;
99use sha2:: { Digest , Sha256 } ;
1010use std:: collections:: HashMap ;
11+ use std:: error:: Error ;
1112use std:: fs:: File ;
1213use std:: io:: { BufRead , BufReader , BufWriter , Write } ;
1314use std:: sync:: Arc ;
@@ -64,8 +65,9 @@ pub struct Args {
6465 #[ arg( long = "no-compact" , default_value = "false" ) ]
6566 pub no_compact : bool ,
6667
67- /// Sparsification factor (keep this fraction of alignment pairs, 1.0 = keep all, 'auto' for automatic)
68- #[ arg( short = 'x' , long = "sparsify" , default_value = "1.0" ) ]
68+ /// Sparsification strategy: 'none', 'auto', 'random:F', 'connectivity:F', 'tree:K,K2,F,SIZE'
69+ /// Examples: 'none' (all pairs), 'random:0.5' (50% random), 'tree:3,3,0.1,16' (tree sampling)
70+ #[ arg( short = 'x' , long = "sparsify" , default_value = "none" ) ]
6971 pub sparsification : String ,
7072
7173 /// Output alignments to PAF file
@@ -352,6 +354,63 @@ impl SeqRush {
352354 representatives. len ( )
353355 }
354356
357+ /// Parse sparsification strategy from string
358+ fn parse_sparsification ( s : & str ) -> Result < SparsificationStrategy , Box < dyn Error > > {
359+ match s {
360+ "none" | "1.0" => Ok ( SparsificationStrategy :: None ) ,
361+ "auto" => Ok ( SparsificationStrategy :: Auto ) ,
362+ s if s. starts_with ( "random:" ) => {
363+ let factor: f64 = s[ 7 ..] . parse ( )
364+ . map_err ( |_| format ! ( "Invalid random factor: {}" , s) ) ?;
365+ if factor > 0.0 && factor <= 1.0 {
366+ Ok ( SparsificationStrategy :: Random ( factor) )
367+ } else {
368+ Err ( format ! ( "Random factor must be in (0.0, 1.0], got {}" , factor) . into ( ) )
369+ }
370+ }
371+ s if s. starts_with ( "connectivity:" ) => {
372+ let prob: f64 = s[ 13 ..] . parse ( )
373+ . map_err ( |_| format ! ( "Invalid connectivity probability: {}" , s) ) ?;
374+ if prob > 0.0 && prob <= 1.0 {
375+ Ok ( SparsificationStrategy :: Connectivity ( prob) )
376+ } else {
377+ Err ( format ! ( "Connectivity probability must be in (0.0, 1.0], got {}" , prob) . into ( ) )
378+ }
379+ }
380+ s if s. starts_with ( "tree:" ) => {
381+ let parts: Vec < & str > = s[ 5 ..] . split ( ',' ) . collect ( ) ;
382+ if parts. len ( ) != 4 {
383+ return Err ( format ! ( "Tree sampling requires 4 values: tree:K_NEAR,K_FAR,RAND_FRAC,KMER_SIZE, got {}" , s) . into ( ) ) ;
384+ }
385+ let k_nearest: usize = parts[ 0 ] . parse ( )
386+ . map_err ( |_| format ! ( "Invalid k_nearest: {}" , parts[ 0 ] ) ) ?;
387+ let k_farthest: usize = parts[ 1 ] . parse ( )
388+ . map_err ( |_| format ! ( "Invalid k_farthest: {}" , parts[ 1 ] ) ) ?;
389+ let rand_frac: f64 = parts[ 2 ] . parse ( )
390+ . map_err ( |_| format ! ( "Invalid random_fraction: {}" , parts[ 2 ] ) ) ?;
391+ let kmer_size: usize = parts[ 3 ] . parse ( )
392+ . map_err ( |_| format ! ( "Invalid kmer_size: {}" , parts[ 3 ] ) ) ?;
393+
394+ if rand_frac < 0.0 || rand_frac > 1.0 {
395+ return Err ( format ! ( "Random fraction must be in [0.0, 1.0], got {}" , rand_frac) . into ( ) ) ;
396+ }
397+ if kmer_size == 0 {
398+ return Err ( "Kmer size must be > 0" . into ( ) ) ;
399+ }
400+
401+ Ok ( SparsificationStrategy :: TreeSampling ( k_nearest, k_farthest, rand_frac, Some ( kmer_size) ) )
402+ }
403+ // Backward compatibility: treat plain float as random factor
404+ s => match s. parse :: < f64 > ( ) {
405+ Ok ( factor) if factor > 0.0 && factor <= 1.0 => {
406+ eprintln ! ( "Warning: Plain float deprecated. Use 'random:{}' instead" , factor) ;
407+ Ok ( SparsificationStrategy :: Random ( factor) )
408+ }
409+ _ => Err ( format ! ( "Invalid sparsification: '{}'. Use 'none', 'auto', 'random:F', 'connectivity:F', or 'tree:K,K2,F,SIZE'" , s) . into ( ) )
410+ }
411+ }
412+ }
413+
355414 pub fn build_graph ( & mut self , args : & Args ) {
356415 println ! (
357416 "Building graph with {} sequences (total length: {})" ,
@@ -588,21 +647,12 @@ impl SeqRush {
588647 } ;
589648
590649 // Parse sparsification strategy
591- let sparsification = match args. sparsification . as_str ( ) {
592- "1.0" => SparsificationStrategy :: None ,
593- "auto" => SparsificationStrategy :: Auto ,
594- s => match s. parse :: < f64 > ( ) {
595- Ok ( factor) if factor > 0.0 && factor <= 1.0 => {
596- SparsificationStrategy :: Random ( factor)
597- }
598- _ => {
599- eprintln ! (
600- "Invalid sparsification factor: {}. Using 1.0 (no sparsification)" ,
601- s
602- ) ;
603- SparsificationStrategy :: None
604- }
605- } ,
650+ let sparsification = match Self :: parse_sparsification ( & args. sparsification ) {
651+ Ok ( s) => s,
652+ Err ( e) => {
653+ eprintln ! ( "Error parsing sparsification: {}. Using 'none'" , e) ;
654+ SparsificationStrategy :: None
655+ }
606656 } ;
607657
608658 // Create PAF writer if requested
@@ -847,20 +897,43 @@ impl SeqRush {
847897 max_divergence : None ,
848898 } ;
849899
900+ // Parse sparsification strategy - use tree sampling if specified, otherwise default
901+ let sparsification = match Self :: parse_sparsification ( & args. sparsification ) {
902+ Ok ( s) => s,
903+ Err ( e) => {
904+ eprintln ! ( "Error parsing sparsification: {}. Using default tree:3,3,0.1,16" , e) ;
905+ SparsificationStrategy :: TreeSampling ( 3 , 3 , 0.1 , Some ( 16 ) )
906+ }
907+ } ;
908+
909+ // Extract tree sampling parameters or use defaults
910+ let ( k_nearest, k_farthest, rand_frac, kmer_size) = match sparsification {
911+ SparsificationStrategy :: TreeSampling ( k, k_far, rf, Some ( ks) ) => ( k, k_far, rf, ks) ,
912+ SparsificationStrategy :: TreeSampling ( k, k_far, rf, None ) => ( k, k_far, rf, 16 ) ,
913+ _ => {
914+ // Default to tree sampling for iterative mode
915+ eprintln ! ( "Note: Iterative mode works best with tree sampling. Using default tree:3,3,0.1,16" ) ;
916+ ( 3 , 3 , 0.1 , 16 )
917+ }
918+ } ;
919+
850920 // Get separated pairs: tree pairs (k-nearest + k-farthest) and random pairs
851921 // Tree pairs guarantee connectivity, random pairs fill in extra detail
852922 let ( tree_pairs, random_pairs) = allwave:: knn_graph:: extract_tree_pairs_separated (
853923 & allwave_sequences,
854- 3 , // k_nearest
855- 3 , // k_farthest
856- 0.1 , // random_fraction
857- 16 , // kmer_size
924+ k_nearest,
925+ k_farthest,
926+ rand_frac ,
927+ kmer_size,
858928 ) ;
859929
860930 println ! (
861- "Processing {} tree pairs (connectivity guarantee ) + {} random pairs (detail )" ,
931+ "Processing {} tree pairs (k={}, k_far={} ) + {} random pairs (frac={} )" ,
862932 tree_pairs. len( ) ,
863- random_pairs. len( )
933+ k_nearest,
934+ k_farthest,
935+ random_pairs. len( ) ,
936+ rand_frac
864937 ) ;
865938
866939 // Create PAF writer if requested
0 commit comments