Skip to content

Commit

Permalink
feat: support protein sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Dec 11, 2024
1 parent 0dde2f5 commit 6835aec
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 16 deletions.
39 changes: 28 additions & 11 deletions src/lib/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::color::{color_background, color_head};
use crate::color::{COLOR_BACKGROUND, COLOR_BASES, COLOR_QUALS};
use crate::reverse_complement;
use crate::DNA_BASES;
use crate::AMINO_ACIDS;
use anyhow::{bail, Context, Result};
use bstr::ByteSlice;
use regex::bytes::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
Expand Down Expand Up @@ -120,9 +121,18 @@ fn bases_colored(
}

/// Validates that a given FIXED pattern contains only valid DNA bases (ACGTN)
pub fn validate_fixed_pattern(pattern: &str) -> Result<()> {
pub fn validate_fixed_pattern(pattern: &str, protein: bool) -> Result<()> {
for (index, base) in pattern.chars().enumerate() {
if !DNA_BASES.contains(&(base as u8)) {
if protein {
if !AMINO_ACIDS.contains(&(base as u8)) {
bail!(
"Fixed pattern must contain only amino acids: {} .. [{}] .. {}",
&pattern[0..index],
&pattern[index..=index],
&pattern[index + 1..],
)
}
} else if !DNA_BASES.contains(&(base as u8)) {
bail!(
"Fixed pattern must contain only DNA bases: {} .. [{}] .. {}",
&pattern[0..index],
Expand Down Expand Up @@ -609,17 +619,24 @@ pub mod tests {
// Tests validate_fixed_pattern()
// ############################################################################################

#[test]
fn test_validate_fixed_pattern_is_ok() {
let pattern = "AGTGTGATG";
let result = validate_fixed_pattern(&pattern);
#[rstest]
#[case("AGTGTGATG", false)]
#[case("QRNQRNQRN", true)]
fn test_validate_fixed_pattern_is_ok(
#[case] pattern: &str,
#[case] protein: bool) {
let result = validate_fixed_pattern(&pattern, protein);
assert!(result.is_ok())
}
#[test]
fn test_validate_fixed_pattern_error() {
let pattern = "AXGTGTGATG";
let msg = String::from("Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG");
let result = validate_fixed_pattern(&pattern);

#[rstest]
#[case("AXGTGTGATG", false, "Fixed pattern must contain only DNA bases: A .. [X] .. GTGTGATG")]
#[case("QRNQRNZQRN", true, "Fixed pattern must contain only amino acids: QRNQRN .. [Z] .. QRN")]
fn test_validate_fixed_pattern_error(
#[case] pattern: &str,
#[case] protein: bool,
#[case] msg: &str) {
let result = validate_fixed_pattern(&pattern, protein);
let inner = result.unwrap_err().to_string();
assert_eq!(inner, msg);
}
Expand Down
1 change: 1 addition & 0 deletions src/lib/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use lazy_static::lazy_static;
use std::{borrow::Borrow, path::Path};

pub const DNA_BASES: [u8; 5] = *b"ACGTN";
pub const AMINO_ACIDS: [u8; 20] = *b"ARNDCEQGHILKMFPSTWYV";
pub const IUPAC_BASES: [u8; 15] = *b"AGCTYRWSKMDVHBN";
pub const IUPAC_BASES_COMPLEMENT: [u8; 15] = *b"TCGARYWSMKHBDVN";

Expand Down
92 changes: 87 additions & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ struct Opts {
#[structopt(long)]
progress: bool,

/// The input files contain protein sequence (amino acids), not DNA sequence.
#[structopt(long)]
protein: bool,

/// The first argument is the pattern to match, with the remaining arguments containing the
/// files to match. If `-e` is given, then all the arguments are files to match.
/// Use standard input if either no files are given or `-` is given.
Expand Down Expand Up @@ -366,13 +370,17 @@ fn fqgrep_from_opts(opts: &Opts) -> Result<usize> {
);
}

if opts.protein && opts.reverse_complement {
bail!("--reverse-complement many not be used with --protein")
}

// Validate the fixed string pattern, if fixed-strings are specified
if opts.fixed_strings {
if let Some(pattern) = &pattern {
validate_fixed_pattern(pattern)?;
validate_fixed_pattern(pattern, opts.protein)?;
} else if !opts.regexp.is_empty() {
for pattern in &opts.regexp {
validate_fixed_pattern(pattern)?;
validate_fixed_pattern(pattern, opts.protein)?;
}
} else {
bail!("A pattern must be given as a positional argument or with -e/--regexp")
Expand Down Expand Up @@ -672,6 +680,7 @@ pub mod tests {
paired: false,
reverse_complement: false,
progress: true,
protein: false,
args: fq_path.to_vec(),
output: output,
};
Expand Down Expand Up @@ -796,6 +805,46 @@ pub mod tests {
let sequence_match = expected_seq == return_sequences;
assert_eq!(sequence_match, expected_bool);
}

// ############################################################################################
//Tests match with protein (not DNA!)
// ############################################################################################
#[rstest]
// fixed strings
#[case(vec!["A"], vec!["AAAA", "ATAT", "TATA", "AAAT", "TTTA"])] // unpaired: fixed string with multiple matches
#[case(vec!["Q"], vec!["QFPQFP"])] // unpaired: fixed string with one match
#[case(vec!["M"], vec![])] // unpaired: fixed string with no matches
// regex
#[case(vec!["^Q"], vec!["QFPQFP"])] // unpaired: regex with one match
#[case(vec!["^Q", "^F"], vec!["QFPQFP"])] // unpaired: regex set with two matches
#[case(vec!["^Q", "^A"], vec!["AAAA", "ATAT", "AAAT", "QFPQFP"])] // unpaired: regex set with two matches
#[case(vec!["^M", "^K"], vec![])] // unpaired: regex set with no matches
fn test_protein_ok(
#[case] pattern: Vec<&str>,
#[case] expected_seq: Vec<&str>,
) {
let dir = TempDir::new().unwrap();
let seqs = vec![
vec!["AAAA", "TTTT", "ATAT", "TATA", "AAAT", "TTTA", "QFPQFP"],
];
let out_path = dir.path().join(String::from("output.fq"));
let result_path = &out_path.clone();
let pattern = pattern.iter().map(|&s| s.to_owned()).collect::<Vec<_>>();
let mut opts = build_opts(
&dir,
&seqs,
&pattern,
true,
Some(out_path),
String::from(".fq"),
);

opts.protein = true;
let _result = fqgrep_from_opts(&opts);
let return_sequences = slurp_output(result_path.to_path_buf());

assert_eq!(return_sequences, expected_seq);
}

// ############################################################################################
// Tests two fastqs for 'TGGATTCAGACTT' which is only found once in the reverse complement
Expand Down Expand Up @@ -831,21 +880,52 @@ pub mod tests {
let result = fqgrep_from_opts(&opts);
assert_eq!(result.unwrap(), expected);
}

//
// ############################################################################################
// Tests that an error is returned when protein and reverse_complement are both present
// ############################################################################################
#[rstest]
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: --reverse-complement many not be used with --protein"
)]
#[case()]
fn text_fails_with_protein_and_reverse_complement() {
let dir = TempDir::new().unwrap();
let seqs = vec![vec!["GGGG", "TTTT"], vec!["AAAA", "CCCC"]];
let pattern = vec![String::from("^G")];

let mut opts = build_opts(&dir, &seqs, &pattern, true, None, String::from(".fq"));

opts.protein = true;
opts.reverse_complement = true;
let _result = fqgrep_from_opts(&opts);
_result.unwrap();
}

// ############################################################################################
// Tests that an error is returned when fixed_strings is true and regex is present
// Tests that an error is returned when fixed_strings is true for DNA and regex is present
// ############################################################################################
#[rstest]
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. G"
)]
#[case(true, vec![String::from("^G")])] // panic with single regex
#[case(true, false, vec![String::from("^G")])] // panic with single regex
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only DNA bases: .. [^] .. A"
)]
#[case(true, vec![String::from("^A"),String::from("AA")])] // panic with combination of regex and fixed string
#[case(true, false, vec![String::from("^A"),String::from("AA")])] // panic with combination of regex and fixed string
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q"
)]
#[case(true, true, vec![String::from("^Q")])] // panic with single regex
#[should_panic(
expected = "called `Result::unwrap()` on an `Err` value: Fixed pattern must contain only amino acids: .. [^] .. Q"
)]
#[case(true, true, vec![String::from("^Q"),String::from("QQ")])] // panic with combination of regex and fixed string
fn test_regexp_from_fixed_string_fails_with_regex(
#[case] fixed_strings: bool,
#[case] protein: bool,
#[case] pattern: Vec<String>,
) {
let dir = TempDir::new().unwrap();
Expand All @@ -854,9 +934,11 @@ pub mod tests {
let mut opts = build_opts(&dir, &seqs, &pattern, true, None, String::from(".fq"));

opts.fixed_strings = fixed_strings;
opts.protein = protein;
let _result = fqgrep_from_opts(&opts);
_result.unwrap();
}

// ############################################################################################
// Tests error is returned from main when three records are defined as paired
// ############################################################################################
Expand Down

0 comments on commit 6835aec

Please sign in to comment.