Skip to content

Commit bf68c7b

Browse files
authored
Hide advanced sorting configuration options (#11)
* Hide advanced sorting configuration options from help Simplifies the user interface by hiding detailed sorting config options. Users only see --no-sort as the main control, but advanced options remain functional if you know about them. Hidden from help (but still work): - --skip-sgd, --skip-groom, --skip-topo - --sgd-iter-max, --sgd-eta-max - --sgd-theta, --sgd-eps, --sgd-cooling-start Visible: - --no-sort (simple on/off for all sorting) This keeps the interface clean for most users while preserving advanced configuration for developers and power users. * Improve sparsification parameter naming and clean up legacy code - Updated sparsification format from 'tree:K,K2,F,SIZE' to 'tree:neighbor,stranger,random,k-mer' for clearer parameter names (neighbor=k-nearest, stranger=k-farthest) - Removed legacy sparsification parsing code that was causing confusing error messages when using tree sampling - Removed unused sparsity_threshold field from SeqRush struct - sparsification now handled entirely at the aligner level through SparsificationStrategy enum - Updated all test code to match new SeqRush::new() signature This eliminates the "Invalid sparsification value" warning that appeared when using tree sampling, making the output cleaner. * Support short forms of tree sparsification parameter Allow shorter forms of tree sampling for convenience: - tree:3 → TreeSampling(3, 0, 0.0, 16) - tree:3,2 → TreeSampling(3, 2, 0.0, 16) - tree:3,2,0.1 → TreeSampling(3, 2, 0.1, 16) - tree:3,2,0.1,21 → TreeSampling(3, 2, 0.1, 21) Defaults: - stranger count: 0 - random fraction: 0.0 - k-mer size: 16 This makes the most common case (just specifying k-nearest neighbors) much simpler to type.
1 parent c785c34 commit bf68c7b

File tree

4 files changed

+59
-74
lines changed

4 files changed

+59
-74
lines changed

src/bin/test_duplicate_edges.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ fn main() {
1818
offset: 5,
1919
};
2020

21-
let mut seqrush = SeqRush::new(vec![seq1, seq2], 0);
21+
let mut seqrush = SeqRush::new(vec![seq1, seq2]);
2222

2323
// Unite some positions to create shared nodes
2424
// Make it so node 2 and 3 are shared between sequences

src/bin/test_rc_edges.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ fn main() {
1919
offset: 4,
2020
};
2121

22-
let mut seqrush = SeqRush::new(vec![seq1, seq2], 0);
22+
let mut seqrush = SeqRush::new(vec![seq1, seq2]);
2323

2424
// Unite them via RC alignment
2525
// When seq1 is RC'd, it becomes CGAT which matches seq2

src/seqrush.rs

Lines changed: 50 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ pub struct Args {
6565
#[arg(long = "no-compact", default_value = "false")]
6666
pub no_compact: bool,
6767

68-
/// Sparsification strategy: 'none', 'auto', 'random:F', 'connectivity:F', 'tree:K,K2,F,SIZE'
69-
/// Examples: 'none' (all pairs), 'random:0.5' (50% random), 'tree:3,3,0.1,16' (tree sampling)
68+
/// Sparsification strategy: 'none', 'auto', 'random:F', 'connectivity:F', 'tree:neighbor[,stranger[,random[,k-mer]]]'
69+
/// Examples: 'none' (all pairs), 'random:0.5' (50% random), 'tree:3' (3 nearest), 'tree:3,2' (3 nearest, 2 farthest), 'tree:3,3,0.1,16' (full)
7070
#[arg(short = 'x', long = "sparsify", default_value = "none")]
7171
pub sparsification: String,
7272

@@ -87,35 +87,35 @@ pub struct Args {
8787
pub no_sort: bool,
8888

8989
/// Skip the path-guided SGD phase (Y) of the Ygs pipeline
90-
#[arg(long = "skip-sgd", default_value = "false")]
90+
#[arg(long = "skip-sgd", default_value = "false", hide = true)]
9191
pub skip_sgd: bool,
9292

9393
/// Skip the grooming phase (g) of the Ygs pipeline
94-
#[arg(long = "skip-groom", default_value = "false")]
94+
#[arg(long = "skip-groom", default_value = "false", hide = true)]
9595
pub skip_groom: bool,
9696

9797
/// Skip the topological sort phase (s) of the Ygs pipeline
98-
#[arg(long = "skip-topo", default_value = "false")]
98+
#[arg(long = "skip-topo", default_value = "false", hide = true)]
9999
pub skip_topo: bool,
100100

101101
/// Number of SGD iterations (default: 100, matching ODGI)
102-
#[arg(long = "sgd-iter-max", default_value = "100")]
102+
#[arg(long = "sgd-iter-max", default_value = "100", hide = true)]
103103
pub sgd_iter_max: u64,
104104

105105
/// SGD learning rate parameter eta_max (default: calculated from graph)
106-
#[arg(long = "sgd-eta-max")]
106+
#[arg(long = "sgd-eta-max", hide = true)]
107107
pub sgd_eta_max: Option<f64>,
108108

109109
/// SGD cooling/momentum parameter theta (default: 0.99)
110-
#[arg(long = "sgd-theta", default_value = "0.99")]
110+
#[arg(long = "sgd-theta", default_value = "0.99", hide = true)]
111111
pub sgd_theta: f64,
112112

113113
/// SGD convergence parameter eps (default: 0.01)
114-
#[arg(long = "sgd-eps", default_value = "0.01")]
114+
#[arg(long = "sgd-eps", default_value = "0.01", hide = true)]
115115
pub sgd_eps: f64,
116116

117117
/// SGD cooling start (default: 0.5)
118-
#[arg(long = "sgd-cooling-start", default_value = "0.5")]
118+
#[arg(long = "sgd-cooling-start", default_value = "0.5", hide = true)]
119119
pub sgd_cooling_start: f64,
120120

121121
/// [DEPRECATED] Use sort-groom-sort strategy (use --skip-* flags instead)
@@ -299,14 +299,13 @@ pub struct SeqRush {
299299
pub sequences: Vec<Sequence>,
300300
pub total_length: usize,
301301
pub union_find: BidirectedUnionFind,
302-
pub sparsity_threshold: u64,
303302
/// Maps each position to its orientation relative to its union representative
304303
/// true = reverse orientation relative to representative
305304
pub orientation_map: HashMap<Pos, bool>,
306305
}
307306

308307
impl SeqRush {
309-
pub fn new(sequences: Vec<Sequence>, sparsity_threshold: u64) -> Self {
308+
pub fn new(sequences: Vec<Sequence>) -> Self {
310309
// Validate that no sequences are empty
311310
for seq in &sequences {
312311
if seq.data.is_empty() {
@@ -332,7 +331,6 @@ impl SeqRush {
332331
sequences,
333332
total_length,
334333
union_find,
335-
sparsity_threshold,
336334
orientation_map: HashMap::new(),
337335
}
338336
}
@@ -379,24 +377,45 @@ impl SeqRush {
379377
}
380378
s if s.starts_with("tree:") => {
381379
let parts: Vec<&str> = s[5..].split(',').collect();
382-
if parts.len() != 4 {
383-
return Err(format!("Tree sampling requires 4 values: tree:K_NEAR,K_FAR,RAND_FRAC,KMER_SIZE, got {}", s).into());
380+
if parts.is_empty() || parts.len() > 4 {
381+
return Err(format!("Tree sampling requires 1-4 values: tree:neighbor[,stranger[,random[,k-mer]]], got {}", s).into());
384382
}
383+
384+
// Parse neighbor (required)
385385
let k_nearest: usize = parts[0].parse()
386-
.map_err(|_| format!("Invalid k_nearest: {}", parts[0]))?;
387-
let k_farthest: usize = parts[1].parse()
388-
.map_err(|_| format!("Invalid k_farthest: {}", parts[1]))?;
389-
let rand_frac: f64 = parts[2].parse()
390-
.map_err(|_| format!("Invalid random_fraction: {}", parts[2]))?;
391-
let kmer_size: usize = parts[3].parse()
392-
.map_err(|_| format!("Invalid kmer_size: {}", parts[3]))?;
393-
394-
if rand_frac < 0.0 || rand_frac > 1.0 {
395-
return Err(format!("Random fraction must be in [0.0, 1.0], got {}", rand_frac).into());
396-
}
397-
if kmer_size == 0 {
398-
return Err("Kmer size must be > 0".into());
399-
}
386+
.map_err(|_| format!("Invalid neighbor count: {}", parts[0]))?;
387+
388+
// Parse stranger (default: 0)
389+
let k_farthest: usize = if parts.len() >= 2 {
390+
parts[1].parse()
391+
.map_err(|_| format!("Invalid stranger count: {}", parts[1]))?
392+
} else {
393+
0
394+
};
395+
396+
// Parse random fraction (default: 0.0)
397+
let rand_frac: f64 = if parts.len() >= 3 {
398+
let frac = parts[2].parse()
399+
.map_err(|_| format!("Invalid random fraction: {}", parts[2]))?;
400+
if frac < 0.0 || frac > 1.0 {
401+
return Err(format!("Random fraction must be in [0.0, 1.0], got {}", frac).into());
402+
}
403+
frac
404+
} else {
405+
0.0
406+
};
407+
408+
// Parse k-mer size (default: 16)
409+
let kmer_size: usize = if parts.len() >= 4 {
410+
let size = parts[3].parse()
411+
.map_err(|_| format!("Invalid k-mer size: {}", parts[3]))?;
412+
if size == 0 {
413+
return Err("K-mer size must be > 0".into());
414+
}
415+
size
416+
} else {
417+
16
418+
};
400419

401420
Ok(SparsificationStrategy::TreeSampling(k_nearest, k_farthest, rand_frac, Some(kmer_size)))
402421
}
@@ -406,7 +425,7 @@ impl SeqRush {
406425
eprintln!("Warning: Plain float deprecated. Use 'random:{}' instead", factor);
407426
Ok(SparsificationStrategy::Random(factor))
408427
}
409-
_ => Err(format!("Invalid sparsification: '{}'. Use 'none', 'auto', 'random:F', 'connectivity:F', or 'tree:K,K2,F,SIZE'", s).into())
428+
_ => Err(format!("Invalid sparsification: '{}'. Use 'none', 'auto', 'random:F', 'connectivity:F', or 'tree:neighbor[,stranger[,random[,k-mer]]]'", s).into())
410429
}
411430
}
412431
}
@@ -1826,41 +1845,7 @@ pub fn run_seqrush(args: Args) -> Result<(), Box<dyn std::error::Error>> {
18261845
let sequences = load_sequences(&args.sequences)?;
18271846
println!("Loaded {} sequences", sequences.len());
18281847

1829-
// Calculate sparsification threshold
1830-
let sparsity_threshold = if args.sparsification == "auto" {
1831-
// Use Erdős-Rényi model: keep 10 * log(n) / n fraction of alignment pairs
1832-
let n = sequences.len() as f64;
1833-
let fraction = (10.0 * n.ln() / n).min(1.0);
1834-
println!(
1835-
"Auto sparsification: keeping {:.2}% of alignment pairs (n={})",
1836-
fraction * 100.0,
1837-
sequences.len()
1838-
);
1839-
(fraction * u64::MAX as f64) as u64
1840-
} else {
1841-
match args.sparsification.parse::<f64>() {
1842-
Ok(frac) if (0.0..=1.0).contains(&frac) => {
1843-
if frac == 1.0 {
1844-
u64::MAX
1845-
} else {
1846-
println!(
1847-
"Sparsification: keeping {:.2}% of alignment pairs",
1848-
frac * 100.0
1849-
);
1850-
(frac * u64::MAX as f64) as u64
1851-
}
1852-
}
1853-
_ => {
1854-
eprintln!(
1855-
"Invalid sparsification value: {}. Using 1.0 (no sparsification)",
1856-
args.sparsification
1857-
);
1858-
u64::MAX
1859-
}
1860-
}
1861-
};
1862-
1863-
let mut seqrush = SeqRush::new(sequences, sparsity_threshold);
1848+
let mut seqrush = SeqRush::new(sequences);
18641849
seqrush.build_graph(&args);
18651850

18661851
println!("Graph written to {}", args.output);

tests/test_mathematical_correctness.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ fn test_forward_reverse_unification() {
1111
offset: 0,
1212
};
1313

14-
let seqrush = SeqRush::new(vec![seq], 0);
14+
let seqrush = SeqRush::new(vec![seq]);
1515

1616
// Check that forward and reverse of each position have the same representative
1717
for i in 0..4 {
@@ -52,7 +52,7 @@ fn test_transitive_closure() {
5252
offset: 8,
5353
};
5454

55-
let seqrush = SeqRush::new(vec![seq1, seq2, seq3], 0);
55+
let seqrush = SeqRush::new(vec![seq1, seq2, seq3]);
5656

5757
// Unite seq1[0] with seq2[0]
5858
seqrush
@@ -90,7 +90,7 @@ fn test_single_component_per_position() {
9090
},
9191
];
9292

93-
let seqrush = SeqRush::new(sequences, 0);
93+
let seqrush = SeqRush::new(sequences);
9494

9595
// Unite matching positions
9696
seqrush
@@ -132,7 +132,7 @@ fn test_reverse_complement_alignment() {
132132
offset: 4,
133133
};
134134

135-
let seqrush = SeqRush::new(vec![seq1, seq2], 0);
135+
let seqrush = SeqRush::new(vec![seq1, seq2]);
136136

137137
// seq1 aligns to seq2 via reverse complement
138138
// seq1[0..4] RC aligns to seq2[0..4]
@@ -186,7 +186,7 @@ fn test_identical_sequences_produce_minimal_components() {
186186
},
187187
];
188188

189-
let seqrush = SeqRush::new(sequences, 0);
189+
let seqrush = SeqRush::new(sequences);
190190

191191
// Unite all matching positions
192192
seqrush
@@ -230,7 +230,7 @@ fn test_no_false_unifications() {
230230
},
231231
];
232232

233-
let seqrush = SeqRush::new(sequences, 0);
233+
let seqrush = SeqRush::new(sequences);
234234

235235
// Without any alignments, positions from different sequences should not be united
236236
let pos1 = make_pos(0, false);
@@ -264,7 +264,7 @@ fn test_partial_alignment() {
264264
offset: 12,
265265
};
266266

267-
let seqrush = SeqRush::new(vec![seq1, seq2], 0);
267+
let seqrush = SeqRush::new(vec![seq1, seq2]);
268268

269269
// Only the GGGG region matches (positions 4-7 in seq1, 4-7 in seq2)
270270
seqrush

0 commit comments

Comments
 (0)