diff --git a/src/main/scala/com/fulcrumgenomics/fastq/TrimFastq.scala b/src/main/scala/com/fulcrumgenomics/fastq/TrimFastq.scala index c656eae8b..150da7085 100644 --- a/src/main/scala/com/fulcrumgenomics/fastq/TrimFastq.scala +++ b/src/main/scala/com/fulcrumgenomics/fastq/TrimFastq.scala @@ -46,26 +46,28 @@ import com.fulcrumgenomics.sopt._ class TrimFastq ( @arg(flag='i', doc="One or more input fastq files.") val input: Seq[PathToFastq], @arg(flag='o', doc="A matching number of output fastq files.") val output: Seq[PathToFastq], - @arg(flag='l', doc="Length to trim reads to.") val length: Int, + @arg(flag='l', doc="Length to trim reads to (either one per input fastq file, or one for all).") val length: Seq[Int], @arg(flag='x', doc="Exclude reads below the trim length.") val exclude: Boolean = false ) extends FgBioTool with LazyLogging { validate(input.size == output.size, "Number of input and output files must match.") + validate(length.size == 1 || input.size == length.size, "Number of lengths must be one or match the number of input files.") override def execute(): Unit = { var discarded: Long = 0 val progress = new ProgressLogger(this.logger, noun="records", verb="Wrote") + val lengths = if (this.length.size == 1) List.fill(this.input.size)(this.length.head) else this.length val sources = input.map(FastqSource(_)) val writers = output.map(FastqWriter(_)) while (allHaveNext(sources)) { val recs = sources.map(_.next()) - if (exclude && recs.exists(_.length < length)) { + if (exclude && recs.zip(lengths).exists { case (rec, length) => rec.length < length }) { discarded += 1 } else { - writers.iterator.zip(recs.iterator).foreach { case(w, r) => - w.write(r.trimmedTo(length)) + writers.lazyZip(recs).lazyZip(lengths).foreach { case (w: FastqWriter, r: FastqRecord, l: Int) => + w.write(r.trimmedTo(l)) progress.record() } } diff --git a/src/test/scala/com/fulcrumgenomics/fastq/TrimFastqTest.scala b/src/test/scala/com/fulcrumgenomics/fastq/TrimFastqTest.scala index aced3e23b..f76305dfd 100644 --- a/src/test/scala/com/fulcrumgenomics/fastq/TrimFastqTest.scala +++ b/src/test/scala/com/fulcrumgenomics/fastq/TrimFastqTest.scala @@ -59,9 +59,9 @@ class TrimFastqTest extends UnitSpec { } "TrimFastq" should "trim a single file and not discard any records" in { - val (r1, r2) = fqFiles + val (r1, _) = fqFiles val out = makeTempFile("trimmed.", ".fq") - new TrimFastq(input=Seq(r1), output=Seq(out), length=15, exclude=false).execute() + new TrimFastq(input=Seq(r1), output=Seq(out), length=Seq(15), exclude=false).execute() val r1Map = FastqSource(out).map(r => r.name -> r).toMap r1Map.size shouldBe 3 r1Map("10x10").length shouldBe 10 @@ -70,18 +70,18 @@ class TrimFastqTest extends UnitSpec { } it should "trim a single file and discard 2 records" in { - val (r1, r2) = fqFiles + val (r1, _) = fqFiles val out = makeTempFile("trimmed.", ".fq") - new TrimFastq(input=Seq(r1), output=Seq(out), length=15, exclude=true).execute() + new TrimFastq(input=Seq(r1), output=Seq(out), length=Seq(15), exclude=true).execute() val r1Map = FastqSource(out).map(r => r.name -> r).toMap r1Map.size shouldBe 1 r1Map("20x20").length shouldBe 15 } it should "trim a single file and discard 0 records because they are all long enough" in { - val (r1, r2) = fqFiles + val (r1, _) = fqFiles val out = makeTempFile("trimmed.", ".fq") - new TrimFastq(input=Seq(r1), output=Seq(out), length=5, exclude=true).execute() + new TrimFastq(input=Seq(r1), output=Seq(out), length=Seq(5), exclude=true).execute() val r1Map = FastqSource(out).map(r => r.name -> r).toMap r1Map.size shouldBe 3 r1Map("10x10").length shouldBe 5 @@ -92,7 +92,7 @@ class TrimFastqTest extends UnitSpec { it should "not trim or discard any reads" in { val (r1, r2) = fqFiles val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq")) - new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=25, exclude=false).execute() + new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=Seq(25), exclude=false).execute() val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap r1Map.size shouldBe 3 @@ -108,7 +108,7 @@ class TrimFastqTest extends UnitSpec { it should "trim but not discard some reads" in { val (r1, r2) = fqFiles val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq")) - new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=15, exclude=false).execute() + new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=Seq(15), exclude=false).execute() val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap r1Map.size shouldBe 3 @@ -124,7 +124,7 @@ class TrimFastqTest extends UnitSpec { it should "trim some reads and discard others by pair in" in { val (r1, r2) = fqFiles val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq")) - new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=15, exclude=true).execute() + new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=Seq(15), exclude=true).execute() val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap r1Map.size shouldBe 1 @@ -132,4 +132,32 @@ class TrimFastqTest extends UnitSpec { r1Map("20x20").length shouldBe 15 r2Map("20x20").length shouldBe 15 } + + it should "trim each FASTQ independently to a FASTQ-specific length and not discard reads" in { + val (r1, r2) = fqFiles + val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq")) + new TrimFastq(input = Seq(r1, r2), output = Seq(r1Out, r2Out), length = Seq(10, 15), exclude = false).execute() + val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap + val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap + r1Map.size shouldBe 3 + r2Map.size shouldBe r1Map.size + r1Map("10x10").length shouldBe 10 + r1Map("10x20").length shouldBe 10 + r1Map("20x20").length shouldBe 10 + r2Map("10x10").length shouldBe 10 + r2Map("10x20").length shouldBe 15 + r2Map("20x20").length shouldBe 15 + } + + it should "trim each FASTQ independently to a FASTQ-specific length and discard reads" in { + val (r1, r2) = fqFiles + val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq")) + new TrimFastq(input = Seq(r1, r2), output=Seq(r1Out, r2Out), length = Seq(20, 20), exclude = true).execute() + val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap + val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap + r1Map.size shouldBe 1 + r2Map.size shouldBe r1Map.size + r1Map("20x20").length shouldBe 20 + r2Map("20x20").length shouldBe 20 + } }