Skip to content

Commit c785c34

Browse files
authored
Expose AllWave's full sparsification interface (#10)
Exposes all AllWave sparsification strategies through the CLI, following the AllWave naming model: - 'none' or '1.0' - align all pairs (default) - 'auto' - automatic sparsification based on sequence count - 'random:F' - random sampling with probability F (e.g., 'random:0.5') - 'connectivity:F' - Erdős-Rényi with probability F for connected component - 'tree:K,K2,F,SIZE' - tree sampling with: * K = k-nearest neighbors * K2 = k-farthest neighbors * F = additional random fraction * SIZE = kmer size for Mash distance (e.g., 'tree:3,3,0.1,16') Changes: - Updated --sparsify flag help text with all options - Changed default from '1.0' to 'none' (more intuitive) - Added parse_sparsification() method to handle all formats - Iterative mode now respects user's sparsification setting - Falls back to tree:3,3,0.1,16 for iterative mode if not specified - Backward compatible: plain floats treated as random factors (with warning) Examples: --sparsify none # All pairs --sparsify random:0.5 # 50% random sampling --sparsify tree:3,3,0.1,16 # Tree sampling (default for iterative) --sparsify connectivity:0.95 # Erdős-Rényi for connectivity
1 parent 4c406a3 commit c785c34

File tree

1 file changed

+96
-23
lines changed

1 file changed

+96
-23
lines changed

src/seqrush.rs

Lines changed: 96 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use clap::Parser;
88
use rayon::prelude::*;
99
use sha2::{Digest, Sha256};
1010
use std::collections::HashMap;
11+
use std::error::Error;
1112
use std::fs::File;
1213
use std::io::{BufRead, BufReader, BufWriter, Write};
1314
use std::sync::Arc;
@@ -64,8 +65,9 @@ pub struct Args {
6465
#[arg(long = "no-compact", default_value = "false")]
6566
pub no_compact: bool,
6667

67-
/// Sparsification factor (keep this fraction of alignment pairs, 1.0 = keep all, 'auto' for automatic)
68-
#[arg(short = 'x', long = "sparsify", default_value = "1.0")]
68+
/// Sparsification strategy: 'none', 'auto', 'random:F', 'connectivity:F', 'tree:K,K2,F,SIZE'
69+
/// Examples: 'none' (all pairs), 'random:0.5' (50% random), 'tree:3,3,0.1,16' (tree sampling)
70+
#[arg(short = 'x', long = "sparsify", default_value = "none")]
6971
pub sparsification: String,
7072

7173
/// Output alignments to PAF file
@@ -352,6 +354,63 @@ impl SeqRush {
352354
representatives.len()
353355
}
354356

357+
/// Parse sparsification strategy from string
358+
fn parse_sparsification(s: &str) -> Result<SparsificationStrategy, Box<dyn Error>> {
359+
match s {
360+
"none" | "1.0" => Ok(SparsificationStrategy::None),
361+
"auto" => Ok(SparsificationStrategy::Auto),
362+
s if s.starts_with("random:") => {
363+
let factor: f64 = s[7..].parse()
364+
.map_err(|_| format!("Invalid random factor: {}", s))?;
365+
if factor > 0.0 && factor <= 1.0 {
366+
Ok(SparsificationStrategy::Random(factor))
367+
} else {
368+
Err(format!("Random factor must be in (0.0, 1.0], got {}", factor).into())
369+
}
370+
}
371+
s if s.starts_with("connectivity:") => {
372+
let prob: f64 = s[13..].parse()
373+
.map_err(|_| format!("Invalid connectivity probability: {}", s))?;
374+
if prob > 0.0 && prob <= 1.0 {
375+
Ok(SparsificationStrategy::Connectivity(prob))
376+
} else {
377+
Err(format!("Connectivity probability must be in (0.0, 1.0], got {}", prob).into())
378+
}
379+
}
380+
s if s.starts_with("tree:") => {
381+
let parts: Vec<&str> = s[5..].split(',').collect();
382+
if parts.len() != 4 {
383+
return Err(format!("Tree sampling requires 4 values: tree:K_NEAR,K_FAR,RAND_FRAC,KMER_SIZE, got {}", s).into());
384+
}
385+
let k_nearest: usize = parts[0].parse()
386+
.map_err(|_| format!("Invalid k_nearest: {}", parts[0]))?;
387+
let k_farthest: usize = parts[1].parse()
388+
.map_err(|_| format!("Invalid k_farthest: {}", parts[1]))?;
389+
let rand_frac: f64 = parts[2].parse()
390+
.map_err(|_| format!("Invalid random_fraction: {}", parts[2]))?;
391+
let kmer_size: usize = parts[3].parse()
392+
.map_err(|_| format!("Invalid kmer_size: {}", parts[3]))?;
393+
394+
if rand_frac < 0.0 || rand_frac > 1.0 {
395+
return Err(format!("Random fraction must be in [0.0, 1.0], got {}", rand_frac).into());
396+
}
397+
if kmer_size == 0 {
398+
return Err("Kmer size must be > 0".into());
399+
}
400+
401+
Ok(SparsificationStrategy::TreeSampling(k_nearest, k_farthest, rand_frac, Some(kmer_size)))
402+
}
403+
// Backward compatibility: treat plain float as random factor
404+
s => match s.parse::<f64>() {
405+
Ok(factor) if factor > 0.0 && factor <= 1.0 => {
406+
eprintln!("Warning: Plain float deprecated. Use 'random:{}' instead", factor);
407+
Ok(SparsificationStrategy::Random(factor))
408+
}
409+
_ => Err(format!("Invalid sparsification: '{}'. Use 'none', 'auto', 'random:F', 'connectivity:F', or 'tree:K,K2,F,SIZE'", s).into())
410+
}
411+
}
412+
}
413+
355414
pub fn build_graph(&mut self, args: &Args) {
356415
println!(
357416
"Building graph with {} sequences (total length: {})",
@@ -588,21 +647,12 @@ impl SeqRush {
588647
};
589648

590649
// Parse sparsification strategy
591-
let sparsification = match args.sparsification.as_str() {
592-
"1.0" => SparsificationStrategy::None,
593-
"auto" => SparsificationStrategy::Auto,
594-
s => match s.parse::<f64>() {
595-
Ok(factor) if factor > 0.0 && factor <= 1.0 => {
596-
SparsificationStrategy::Random(factor)
597-
}
598-
_ => {
599-
eprintln!(
600-
"Invalid sparsification factor: {}. Using 1.0 (no sparsification)",
601-
s
602-
);
603-
SparsificationStrategy::None
604-
}
605-
},
650+
let sparsification = match Self::parse_sparsification(&args.sparsification) {
651+
Ok(s) => s,
652+
Err(e) => {
653+
eprintln!("Error parsing sparsification: {}. Using 'none'", e);
654+
SparsificationStrategy::None
655+
}
606656
};
607657

608658
// Create PAF writer if requested
@@ -847,20 +897,43 @@ impl SeqRush {
847897
max_divergence: None,
848898
};
849899

900+
// Parse sparsification strategy - use tree sampling if specified, otherwise default
901+
let sparsification = match Self::parse_sparsification(&args.sparsification) {
902+
Ok(s) => s,
903+
Err(e) => {
904+
eprintln!("Error parsing sparsification: {}. Using default tree:3,3,0.1,16", e);
905+
SparsificationStrategy::TreeSampling(3, 3, 0.1, Some(16))
906+
}
907+
};
908+
909+
// Extract tree sampling parameters or use defaults
910+
let (k_nearest, k_farthest, rand_frac, kmer_size) = match sparsification {
911+
SparsificationStrategy::TreeSampling(k, k_far, rf, Some(ks)) => (k, k_far, rf, ks),
912+
SparsificationStrategy::TreeSampling(k, k_far, rf, None) => (k, k_far, rf, 16),
913+
_ => {
914+
// Default to tree sampling for iterative mode
915+
eprintln!("Note: Iterative mode works best with tree sampling. Using default tree:3,3,0.1,16");
916+
(3, 3, 0.1, 16)
917+
}
918+
};
919+
850920
// Get separated pairs: tree pairs (k-nearest + k-farthest) and random pairs
851921
// Tree pairs guarantee connectivity, random pairs fill in extra detail
852922
let (tree_pairs, random_pairs) = allwave::knn_graph::extract_tree_pairs_separated(
853923
&allwave_sequences,
854-
3, // k_nearest
855-
3, // k_farthest
856-
0.1, // random_fraction
857-
16, // kmer_size
924+
k_nearest,
925+
k_farthest,
926+
rand_frac,
927+
kmer_size,
858928
);
859929

860930
println!(
861-
"Processing {} tree pairs (connectivity guarantee) + {} random pairs (detail)",
931+
"Processing {} tree pairs (k={}, k_far={}) + {} random pairs (frac={})",
862932
tree_pairs.len(),
863-
random_pairs.len()
933+
k_nearest,
934+
k_farthest,
935+
random_pairs.len(),
936+
rand_frac
864937
);
865938

866939
// Create PAF writer if requested

0 commit comments

Comments
 (0)