Skip to content

Commit c663d5c

Browse files
Merge pull request #113 from adamnovak/fasta-paf
Add `fasta+paf` output format and `-O` output basename option to `impg query`
2 parents 9b36856 + ec4bb6b commit c663d5c

File tree

1 file changed

+102
-36
lines changed

1 file changed

+102
-36
lines changed

src/main.rs

Lines changed: 102 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use rustc_hash::FxHashMap;
1414
use std::collections::hash_map::DefaultHasher;
1515
use std::fs::File;
1616
use std::hash::{Hash, Hasher};
17-
use std::io::{self, BufRead, BufReader, BufWriter};
17+
use std::io::{self, BufRead, BufReader, BufWriter, Write};
1818

1919
use std::num::NonZeroUsize;
2020
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -197,7 +197,7 @@ impl GfaMafFastaOpts {
197197
Option<UnifiedSequenceIndex>,
198198
Option<(u8, u8, u8, u8, u8, u8)>,
199199
)> {
200-
let needs_sequence_mandatory = matches!(output_format, "gfa" | "maf" | "fasta");
200+
let needs_sequence_mandatory = matches!(output_format, "gfa" | "maf" | "fasta" | "fasta+paf");
201201
let needs_sequence_optional = output_format == "paf" && original_sequence_coordinates;
202202
let needs_poa = matches!(output_format, "gfa" | "maf");
203203

@@ -523,11 +523,15 @@ enum Args {
523523
#[clap(flatten)]
524524
query: QueryOpts,
525525

526-
/// Output format: 'auto' ('bed' for -r, 'bedpe' for -b), 'bed', 'bedpe', 'paf', 'gfa' (v1.0), 'maf', or 'fasta' ('gfa', 'maf', and 'fasta' require --sequence-files or --sequence-list)
526+
/// Output format: 'auto' ('bed' for -r, 'bedpe' for -b), 'bed', 'bedpe', 'paf', 'gfa' (v1.0), 'maf', 'fasta', or 'fasta+paf' ('gfa', 'maf', 'fasta', and 'fasta+paf' require --sequence-files or --sequence-list)
527527
#[arg(help_heading = "Output options")]
528528
#[clap(short = 'o', long, value_parser, default_value = "auto")]
529529
output_format: String,
530530

531+
/// Destination file basename, or nothing for standard output
532+
#[clap(short = 'O', long, value_parser, default_value = None)]
533+
output_basename: Option<String>,
534+
531535
#[clap(flatten)]
532536
gfa_maf_fasta: GfaMafFastaOpts,
533537

@@ -801,13 +805,14 @@ fn main() -> io::Result<()> {
801805
paf,
802806
query,
803807
output_format,
808+
output_basename,
804809
gfa_maf_fasta,
805810
} => {
806811
initialize_threads_and_log(&common);
807812

808813
validate_output_format(
809814
&output_format,
810-
&["auto", "bed", "bedpe", "paf", "gfa", "maf", "fasta"],
815+
&["auto", "bed", "bedpe", "paf", "gfa", "maf", "fasta", "fasta+paf"],
811816
)?;
812817

813818
let impg = initialize_impg(&common, &paf)?;
@@ -898,47 +903,54 @@ fn main() -> io::Result<()> {
898903
subset_filter.as_ref(),
899904
)?;
900905

906+
// TODO: Why is name an Option for all the output functions?
907+
let name_opt = Some(name);
908+
901909
// Output results based on the resolved format
902910
match resolved_output_format {
903911
"bed" => {
904912
// BED format - include the first element
905913
output_results_bed(
906914
&impg,
907915
&mut results,
908-
Some(name),
916+
&mut find_output_stream(&output_basename, "bed")?,
917+
&name_opt,
909918
query.effective_merge_distance(),
910919
query.original_sequence_coordinates,
911-
);
920+
)?;
912921
}
913922
"bedpe" => {
914923
// Skip the first element (the input range) for BEDPE output
915924
results.remove(0);
916925
output_results_bedpe(
917926
&impg,
918927
&mut results,
919-
Some(name),
928+
&mut find_output_stream(&output_basename, "bed")?,
929+
&name_opt,
920930
query.effective_merge_distance(),
921931
query.original_sequence_coordinates,
922-
);
932+
)?;
923933
}
924934
"paf" => {
925935
// Skip the first element (the input range) for PAF output
926936
results.remove(0);
927937
output_results_paf(
928938
&impg,
929939
&mut results,
930-
Some(name),
940+
&mut find_output_stream(&output_basename, "paf")?,
941+
&name_opt,
931942
query.effective_merge_distance(),
932943
query.original_sequence_coordinates,
933944
sequence_index.as_ref(),
934-
);
945+
)?;
935946
}
936947
"gfa" => {
937948
output_results_gfa(
938949
&impg,
939950
&mut results,
951+
&mut find_output_stream(&output_basename, "gfa")?,
940952
sequence_index.as_ref().unwrap(),
941-
Some(name),
953+
&name_opt,
942954
query.effective_merge_distance(),
943955
scoring_params.unwrap(),
944956
)?;
@@ -947,8 +959,9 @@ fn main() -> io::Result<()> {
947959
output_results_maf(
948960
&impg,
949961
&mut results,
962+
&mut find_output_stream(&output_basename, "maf")?,
950963
sequence_index.as_ref().unwrap(),
951-
Some(name),
964+
&name_opt,
952965
query.effective_merge_distance(),
953966
scoring_params.unwrap(),
954967
)?;
@@ -957,12 +970,35 @@ fn main() -> io::Result<()> {
957970
output_results_fasta(
958971
&impg,
959972
&mut results,
973+
&mut find_output_stream(&output_basename, "fa")?,
960974
sequence_index.as_ref().unwrap(),
961-
Some(name),
975+
&name_opt,
962976
query.effective_merge_distance(),
963977
reverse_complement,
964978
)?;
965-
}
979+
},
980+
"fasta+paf" => {
981+
output_results_fasta(
982+
&impg,
983+
&mut results,
984+
&mut find_output_stream(&output_basename, "fa")?,
985+
sequence_index.as_ref().unwrap(),
986+
&name_opt,
987+
query.effective_merge_distance(),
988+
reverse_complement,
989+
)?;
990+
// Skip the first element (the input range) for PAF output
991+
results.remove(0);
992+
output_results_paf(
993+
&impg,
994+
&mut results,
995+
&mut find_output_stream(&output_basename, "paf")?,
996+
&name_opt,
997+
query.effective_merge_distance(),
998+
query.original_sequence_coordinates,
999+
sequence_index.as_ref(),
1000+
)?;
1001+
},
9661002
_ => {
9671003
return Err(io::Error::new(
9681004
io::ErrorKind::InvalidInput,
@@ -1158,7 +1194,7 @@ fn main() -> io::Result<()> {
11581194
fn validate_selection_mode(mode: &str) -> io::Result<()> {
11591195
match mode {
11601196
"longest" | "total" => Ok(()),
1161-
mode if mode == "sample" || mode == "haplotype"
1197+
mode if mode == "sample" || mode == "haplotype"
11621198
|| mode.starts_with("sample,") || mode.starts_with("haplotype,") => Ok(()),
11631199
_ => Err(io::Error::new(
11641200
io::ErrorKind::InvalidInput,
@@ -1383,6 +1419,20 @@ fn get_auto_reader(path: &str) -> io::Result<Box<dyn BufRead>> {
13831419
Ok(Box::new(BufReader::new(reader)))
13841420
}
13851421

1422+
/// Helper function to return a Write implementer that is either standard output or a file with the
1423+
/// appropriate basename and extension. When no basename is provided, uses standard output.
1424+
fn find_output_stream(basename: &Option<String>, extension: &str) -> io::Result<Box<dyn Write>> {
1425+
// Anthropic's Claude came up with this.
1426+
match basename {
1427+
Some(name) => {
1428+
let filename = format!("{}.{}", name, extension);
1429+
let file = File::create(filename)?;
1430+
Ok(Box::new(file))
1431+
}
1432+
None => Ok(Box::new(io::stdout())),
1433+
}
1434+
}
1435+
13861436
/// Load/generate index based on common and PAF options
13871437
fn initialize_impg(common: &CommonOpts, paf: &PafOpts) -> io::Result<Impg> {
13881438
// Resolve the list of PAF files
@@ -1778,10 +1828,11 @@ fn apply_subset_filter_if_provided(
17781828
fn output_results_bed(
17791829
impg: &Impg,
17801830
results: &mut Vec<AdjustedInterval>,
1781-
name: Option<String>,
1831+
out: &mut dyn Write,
1832+
name: &Option<String>,
17821833
merge_distance: i32,
17831834
original_coordinates: bool,
1784-
) {
1835+
) -> io::Result<()> {
17851836
merge_query_adjusted_intervals(results, merge_distance, false);
17861837

17871838
for (query_interval, _, _) in results {
@@ -1801,24 +1852,28 @@ fn output_results_bed(
18011852
original_coordinates,
18021853
);
18031854

1804-
println!(
1855+
writeln!(
1856+
out,
18051857
"{}\t{}\t{}\t{}\t.\t{}",
18061858
transformed_name,
18071859
transformed_first,
18081860
transformed_last,
18091861
name.as_deref().unwrap_or("."),
18101862
strand
1811-
);
1863+
)?;
18121864
}
1865+
1866+
Ok(())
18131867
}
18141868

18151869
fn output_results_bedpe(
18161870
impg: &Impg,
18171871
results: &mut Vec<AdjustedInterval>,
1818-
name: Option<String>,
1872+
out: &mut dyn Write,
1873+
name: &Option<String>,
18191874
merge_distance: i32,
18201875
original_coordinates: bool,
1821-
) {
1876+
) -> io::Result<()> {
18221877
merge_adjusted_intervals(results, merge_distance);
18231878

18241879
for (overlap_query, cigar, overlap_target) in results {
@@ -1878,7 +1933,8 @@ fn output_results_bedpe(
18781933
.trim_end_matches('.')
18791934
.to_string();
18801935

1881-
println!(
1936+
writeln!(
1937+
out,
18821938
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t0\t{}\t+\tgi:f:{}\tbi:f:{}",
18831939
transformed_query_name,
18841940
transformed_first,
@@ -1890,18 +1946,21 @@ fn output_results_bedpe(
18901946
strand,
18911947
gi_str,
18921948
bi_str
1893-
);
1949+
)?;
18941950
}
1951+
1952+
Ok(())
18951953
}
18961954

18971955
fn output_results_paf(
18981956
impg: &Impg,
18991957
results: &mut Vec<AdjustedInterval>,
1900-
name: Option<String>,
1958+
out: &mut dyn Write,
1959+
name: &Option<String>,
19011960
merge_distance: i32,
19021961
original_coordinates: bool,
19031962
sequence_index: Option<&UnifiedSequenceIndex>,
1904-
) {
1963+
) -> io::Result<()> {
19051964
merge_adjusted_intervals(results, merge_distance);
19061965

19071966
for (overlap_query, cigar, overlap_target) in results {
@@ -1983,23 +2042,28 @@ fn output_results_paf(
19832042
.collect();
19842043

19852044
match name {
1986-
Some(ref name) => println!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tgi:f:{}\tbi:f:{}\tcg:Z:{}\tan:Z:{}",
2045+
Some(ref name) => writeln!(out,
2046+
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tgi:f:{}\tbi:f:{}\tcg:Z:{}\tan:Z:{}",
19872047
transformed_query_name, query_length, transformed_first, transformed_last, strand,
19882048
transformed_target_name, target_length, transformed_target_first, transformed_target_last,
1989-
matches, block_len, 255, gi_str, bi_str, cigar_str, name),
1990-
None => println!("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tgi:f:{}\tbi:f:{}\tcg:Z:{}",
2049+
matches, block_len, 255, gi_str, bi_str, cigar_str, name)?,
2050+
None => writeln!(out,
2051+
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tgi:f:{}\tbi:f:{}\tcg:Z:{}",
19912052
transformed_query_name, query_length, transformed_first, transformed_last, strand,
19922053
transformed_target_name, target_length, transformed_target_first, transformed_target_last,
1993-
matches, block_len, 255, gi_str, bi_str, cigar_str),
2054+
matches, block_len, 255, gi_str, bi_str, cigar_str)?,
19942055
}
19952056
}
2057+
2058+
Ok(())
19962059
}
19972060

19982061
fn output_results_gfa(
19992062
impg: &Impg,
20002063
results: &mut Vec<AdjustedInterval>,
2064+
out: &mut dyn Write,
20012065
sequence_index: &UnifiedSequenceIndex,
2002-
_name: Option<String>,
2066+
_name: &Option<String>,
20032067
merge_distance: i32,
20042068
scoring_params: (u8, u8, u8, u8, u8, u8),
20052069
) -> io::Result<()> {
@@ -2017,16 +2081,17 @@ fn output_results_gfa(
20172081
sequence_index,
20182082
scoring_params,
20192083
);
2020-
print!("{gfa_output}");
2084+
writeln!(out, "{gfa_output}")?;
20212085

20222086
Ok(())
20232087
}
20242088

20252089
fn output_results_fasta(
20262090
impg: &Impg,
20272091
results: &mut Vec<AdjustedInterval>,
2092+
out: &mut dyn Write,
20282093
sequence_index: &UnifiedSequenceIndex,
2029-
_name: Option<String>,
2094+
_name: &Option<String>,
20302095
merge_distance: i32,
20312096
reverse_complement: bool,
20322097
) -> io::Result<()> {
@@ -2079,8 +2144,8 @@ fn output_results_fasta(
20792144

20802145
// Output sequences sequentially to maintain order
20812146
for (header, sequence) in sequence_data {
2082-
println!("{header}");
2083-
println!("{sequence}");
2147+
writeln!(out, "{header}")?;
2148+
writeln!(out, "{sequence}")?;
20842149
}
20852150

20862151
Ok(())
@@ -2089,8 +2154,9 @@ fn output_results_fasta(
20892154
fn output_results_maf(
20902155
impg: &Impg,
20912156
results: &mut Vec<AdjustedInterval>,
2157+
out: &mut dyn Write,
20922158
sequence_index: &UnifiedSequenceIndex,
2093-
_name: Option<String>,
2159+
_name: &Option<String>,
20942160
merge_distance: i32,
20952161
scoring_params: (u8, u8, u8, u8, u8, u8),
20962162
) -> io::Result<()> {
@@ -2108,7 +2174,7 @@ fn output_results_maf(
21082174
sequence_index,
21092175
scoring_params,
21102176
);
2111-
print!("{maf_output}");
2177+
writeln!(out, "{maf_output}")?;
21122178

21132179
Ok(())
21142180
}

0 commit comments

Comments
 (0)