feat: propagate left/right target overlap annotations from SvPileup to #22

Merged
merged 14 commits on Aug 22, 2023
2 changes: 2 additions & 0 deletions docs/04_Metrics.md
@@ -44,6 +44,8 @@ Aggregated cluster of breakpoint pileups
|right_frequency|Option[Double]|Proportion of reads mapping at one of the right breakends that support a breakpoint|
|left_overlaps_target|Boolean|True if the left aggregated region overlaps a target region|
|right_overlaps_target|Boolean|True if the right aggregated region overlaps a target region|
|left_targets|Option[String]|The comma-delimited list of target names overlapping the left breakpoint|
|right_targets|Option[String]|The comma-delimited list of target names overlapping the right breakpoint|


### BreakpointPileup
16 changes: 9 additions & 7 deletions docs/tools/AggregateSvPileup.md
@@ -23,13 +23,15 @@ breakpoint occurs within a mapped read, and read-pair evidence, where a breakpoi
insert between two paired reads. Currently this tool treats both forms of evidence equally, despite the
inaccuracy of positions reported by `SvPileup` for read-pair evidence.

If a bam file is provided, each aggregated pileup is annotated with the allele frequency at its left and right
If a BAM file is provided, each aggregated pileup is annotated with the allele frequency at its left and right
breakends. Allele frequency is defined as the total number of templates supporting constituent breakpoints divided
by the count of the union of templates that cross any constituent breakends. In particular, paired templates that
straddle a breakend are considered to cross the breakend.
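
As a rough illustration of this definition, here is a minimal sketch (the parameter names are hypothetical and not part of the tool's API):

```scala
// Hypothetical sketch of the allele-frequency definition described above.
// `supporting`: templates supporting any constituent breakpoint
// `crossing`:   the union of templates that cross any constituent breakend
//               (supporting templates also cross a breakend, so they are a subset of `crossing`)
def alleleFrequency(supporting: Set[String], crossing: Set[String]): Option[Double] = {
  val denominator = (supporting union crossing).size // equals crossing.size when supporting is a subset
  if (denominator == 0) None else Some(supporting.size.toDouble / denominator)
}
```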

If a bed file of target regions is provided, each aggregated pileup is annotated with whether its left and
right sides overlap a target region.
If a BED file of target regions is provided, each aggregated pileup is annotated with whether its left and
right sides overlap a target region. Additionally, the names of the overlapping target regions are
annotated, overwriting any values from the `SvPileup` input. If no target regions are provided, the names
of the overlapping target regions are copied from the `SvPileup` input (if present).
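
A minimal sketch of this precedence (the variable names here are illustrative, not the tool's actual API):

```scala
// Sketch: pick the target-name annotation for one side of an aggregated pileup.
// `bedProvided`:     whether a BED file of target regions was supplied
// `bedTargets`:      names derived from the BED file for this side (None if no overlap)
// `svPileupTargets`: names carried over from the SvPileup input, if present
def chooseTargets(bedProvided: Boolean,
                  bedTargets: Option[String],
                  svPileupTargets: Option[String]): Option[String] =
  if (bedProvided) bedTargets  // BED-derived names overwrite any SvPileup-provided values
  else svPileupTargets         // otherwise fall back to the SvPileup annotation, if present
```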

The output file is a tab-delimited table with one record per aggregated cluster of pileups. Aggregated
pileups are reported with the minimum and maximum (inclusive) coordinates of all pileups in the cluster, a
@@ -42,10 +42,10 @@ pileups in the cluster.
|----|----|----|-----------|---------|----------|----------------|
|input|i|FilePath|Input text file of pileups generated by SvPileup|Required|1||
|bam|b|PathToBam|Bam file for allele frequency analysis. Must be the same file that was used as input to SvPileup.|Optional|1||
|flank|f|Int|If bam file is provided: distance upstream and downstream of aggregated breakpoint regions to search for mapped templates that overlap breakends. These are the templates that will be partitioned into those supporting the breakpoint vs. reading through it for the allele frequency calculation. Recommended to use at least the max read pair inner distance used by SvPileup.|Optional|1|1000|
|min-breakpoint-support||Int|If bam file is provided: minimum total number of templates supporting an aggregated breakpoint to report allele frequency. Supports speed improvement by avoiding querying and iterating over huge read pileups that contain insufficient support for a breakpoint to be considered interesting.|Optional|1|10|
|min-frequency||Double|If bam file is provided: minimum allele frequency to report. Supports speed improvement by avoiding iterating over huge read pileups that contain insufficient support for a breakpoint to be considered interesting.|Optional|1|0.001|
|targets-bed|t|FilePath|Optional bed file of target regions|Optional|1||
|flank|f|Int|If BAM file is provided: distance upstream and downstream of aggregated breakpoint regions to search for mapped templates that overlap breakends. These are the templates that will be partitioned into those supporting the breakpoint vs. reading through it for the allele frequency calculation. Recommended to use at least the max read pair inner distance used by SvPileup.|Optional|1|1000|
|min-breakpoint-support||Int|If BAM file is provided: minimum total number of templates supporting an aggregated breakpoint to report allele frequency. Supports speed improvement by avoiding querying and iterating over huge read pileups that contain insufficient support for a breakpoint to be considered interesting.|Optional|1|10|
|min-frequency||Double|If BAM file is provided: minimum allele frequency to report. Supports speed improvement by avoiding iterating over huge read pileups that contain insufficient support for a breakpoint to be considered interesting.|Optional|1|0.001|
|targets-bed|t|FilePath|Optional BED file of target regions|Optional|1||
|output|o|FilePath|Output file|Required|1||
|max-dist|d|Int|Distance threshold below which to cluster breakpoints|Optional|1|10|

2 changes: 1 addition & 1 deletion docs/tools/index.md
@@ -4,7 +4,7 @@ title: fgsv tools

# fgsv tools

The following tools are available in fgsv version 20230516-51c773a.
The following tools are available in fgsv version 0.0.2-07c651c.
## All tools

All tools.
@@ -11,7 +11,7 @@ import com.fulcrumgenomics.sv.{BreakpointPileup, Intervals}
import com.fulcrumgenomics.util.{Io, Metric}
import htsjdk.samtools.util.{Interval, OverlapDetector}
import htsjdk.tribble.bed.BEDFeature

import com.fulcrumgenomics.FgBioDef.javaIterableToIterator
import scala.collection.mutable


@@ -33,13 +33,15 @@ import scala.collection.mutable
|insert between two paired reads. Currently this tool treats both forms of evidence equally, despite the
|inaccuracy of positions reported by `SvPileup` for read-pair evidence.
|
|If a bam file is provided, each aggregated pileup is annotated with the allele frequency at its left and right
|If a BAM file is provided, each aggregated pileup is annotated with the allele frequency at its left and right
|breakends. Allele frequency is defined as the total number of templates supporting constituent breakpoints divided
|by the count of the union of templates that cross any constituent breakends. In particular, paired templates that
|straddle a breakend are considered to cross the breakend.
|
|If a bed file of target regions is provided, each aggregated pileup is annotated with whether its left and
|right sides overlap a target region.
|If a BED file of target regions is provided, each aggregated pileup is annotated with whether its left and
|right sides overlap a target region. Additionally, the names of the overlapping target regions are
|annotated, overwriting any values from the `SvPileup` input. If no target regions are provided, the names
|of the overlapping target regions are copied from the `SvPileup` input (if present).
|
|The output file is a tab-delimited table with one record per aggregated cluster of pileups. Aggregated
|pileups are reported with the minimum and maximum (inclusive) coordinates of all pileups in the cluster, a
@@ -50,17 +52,17 @@ class AggregateSvPileup
(@arg(flag='i', doc="Input text file of pileups generated by SvPileup") input: FilePath,
@arg(flag='b', doc="Bam file for allele frequency analysis. Must be the same file that was used as input to SvPileup.")
bam: Option[PathToBam] = None,
@arg(flag='f', doc="If bam file is provided: distance upstream and downstream of aggregated breakpoint regions to " +
@arg(flag='f', doc="If BAM file is provided: distance upstream and downstream of aggregated breakpoint regions to " +
"search for mapped templates that overlap breakends. These are the templates that will be partitioned into those " +
"supporting the breakpoint vs. reading through it for the allele frequency calculation. Recommended to use at " +
"least the max read pair inner distance used by SvPileup.") flank: Int = 1000,
@arg(doc="If bam file is provided: minimum total number of templates supporting an aggregated breakpoint to report " +
@arg(doc="If BAM file is provided: minimum total number of templates supporting an aggregated breakpoint to report " +
"allele frequency. Supports speed improvement by avoiding querying and iterating over huge read pileups that " +
"contain insufficient support for a breakpoint to be considered interesting.") minBreakpointSupport: Int = 10,
@arg(doc="If bam file is provided: minimum allele frequency to report. Supports speed improvement by " +
@arg(doc="If BAM file is provided: minimum allele frequency to report. Supports speed improvement by " +
"avoiding iterating over huge read pileups that contain insufficient support for a breakpoint to be considered " +
"interesting.") minFrequency: Double = 0.001,
@arg(flag='t', doc="Optional bed file of target regions") targetsBed: Option[FilePath] = None,
@arg(flag='t', doc="Optional BED file of target regions") targetsBed: Option[FilePath] = None,
@arg(flag='o', doc="Output file") output: FilePath,
@arg(flag='d', doc="Distance threshold below which to cluster breakpoints") maxDist: Int = 10,
) extends SvTool with LazyLogging {
@@ -74,10 +76,10 @@ class AggregateSvPileup
val pileups: PileupGroup = Metric.read[BreakpointPileup](input).toIndexedSeq
logger.info(f"Read ${pileups.length} pileups from file $input")

// Open bam reader
// Open BAM reader
val samSource: Option[SamSource] = bam.map(SamSource(_))

// Read targets from bed file and build OverlapDetector
// Read targets from BED file and build OverlapDetector
val targets: Option[OverlapDetector[BEDFeature]] = targetsBed.map(Intervals.overlapDetectorFrom)

// Open output writer
@@ -245,6 +247,8 @@ object BreakpointCategory extends Enumeration {
* @param right_frequency Proportion of reads mapping at one of the right breakends that support a breakpoint
* @param left_overlaps_target True if the left aggregated region overlaps a target region
* @param right_overlaps_target True if the right aggregated region overlaps a target region
* @param left_targets The comma-delimited list of target names overlapping the left breakpoint
* @param right_targets The comma-delimited list of target names overlapping the right breakpoint
*/
case class AggregatedBreakpointPileup(id: String,
category: BreakpointCategory,
@@ -265,6 +269,8 @@ case class AggregatedBreakpointPileup(id: String,
right_frequency: Option[Double] = None,
left_overlaps_target: Boolean = false,
right_overlaps_target: Boolean = false,
left_targets: Option[String] = None,
right_targets: Option[String] = None
) extends Metric with LazyLogging {

/** Returns the IDs of constituent breakpoints */
@@ -276,6 +282,18 @@ case class AggregatedBreakpointPileup(id: String,
assert(pileup.right_contig == right_contig)
assert(pileup.left_strand == left_strand)
assert(pileup.right_strand == right_strand)

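// Merge the target-name annotations from the two pileups: keep whichever side is defined,
// or join both comma-delimited lists when both are present.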
val left_targets = (this.left_targets, pileup.left_targets) match {
case (_, None) => this.left_targets
case (None, _) => pileup.left_targets
case (Some(this_left), Some(pileup_left)) => Some(f"${this_left},${pileup_left}")
}
val right_targets = (this.right_targets, pileup.right_targets) match {
case (_, None) => this.right_targets
case (None, _) => pileup.right_targets
case (Some(this_right), Some(pileup_right)) => Some(f"${this_right},${pileup_right}")
}

AggregatedBreakpointPileup(
id = pileupIds().appended(pileup.id).sorted.mkString("_"),
category = category,
@@ -292,6 +310,8 @@ case class AggregatedBreakpointPileup(id: String,
total = total + pileup.total,
left_pileups = left_pileups.appended(pileup.left_pos),
right_pileups = right_pileups.appended(pileup.right_pos),
left_targets = left_targets,
right_targets = right_targets,
)
}

@@ -411,13 +431,29 @@ case class AggregatedBreakpointPileup(id: String,
* @param targets Optional OverlapDetector for target intervals. If None, target overlap fields are set to false and the target names fall back to any values carried over from the `SvPileup` input.
*/
def addTargetOverlap(targets: Option[OverlapDetector[BEDFeature]]): AggregatedBreakpointPileup = {
val (left_overlaps, right_overlaps) = targets match {
case None => (false, false)
case Some(t) =>
(overlapsTarget(left_contig, left_min_pos, left_max_pos, t),
overlapsTarget(right_contig, right_min_pos, right_max_pos, t))

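// Helper: for one side of the pileup, report whether it overlaps any target and the sorted,
// distinct, comma-delimited names of the overlapping targets; when no BED targets were provided,
// fall back to the default (SvPileup-derived) names.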
def annotations(contig: String, minPos: Int, maxPos: Int, defaultTargets: Option[String] = None): (Boolean, Option[String]) = {
targets match {
case None => (false, defaultTargets)
case Some(detector) =>
val span = new Interval(contig, minPos, maxPos)
if (detector.overlapsAny(span)) {
val overlaps = detector.getOverlaps(span)
(true, Some(overlaps.map(_.getName).toSeq.sorted.distinct.mkString(",")))
} else {
(false, None)
}
}
}
this.copy(left_overlaps_target = left_overlaps, right_overlaps_target = right_overlaps)

val (leftOverlaps, leftTargets) = annotations(left_contig, left_min_pos, left_max_pos, this.left_targets)
val (rightOverlaps, rightTargets) = annotations(right_contig, right_min_pos, right_max_pos, this.right_targets)
this.copy(
left_overlaps_target = leftOverlaps,
right_overlaps_target = rightOverlaps,
left_targets = leftTargets,
right_targets = rightTargets
)
}

private def overlapsTarget(contig: String, start: Int, end: Int, targets: OverlapDetector[BEDFeature]): Boolean = {
@@ -448,6 +484,8 @@ object AggregatedBreakpointPileup {
total = head.total,
left_pileups = PositionList(head.left_pos),
right_pileups = PositionList(head.right_pos),
left_targets = head.left_targets,
right_targets = head.right_targets,
)
tail.foldLeft[AggregatedBreakpointPileup](headAgg)((agg, nextPileup) => agg.add(nextPileup))
case _ => throw new IllegalArgumentException("Pileup group must be non-empty")
4 changes: 2 additions & 2 deletions src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala
@@ -216,8 +216,8 @@ class SvPileup
val breakpoints = tracker.breakpoints.toIndexedSeq.sortWith(Breakpoint.PairedOrdering.lt)

breakpoints.foreach { bp =>
val leftTargets = targets.map(_.getOverlaps(bp.leftInterval(dict)).map(_.getName).toSeq.sorted.distinct.mkString(","))
val rightTargets = targets.map(_.getOverlaps(bp.rightInterval(dict)).map(_.getName).toSeq.sorted.distinct.mkString(","))
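// Map an empty list of overlapping target names to None rather than an empty string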
val leftTargets = targets.map(_.getOverlaps(bp.leftInterval(dict)).map(_.getName).toSeq.sorted.distinct.mkString(",")).flatMap(s => if (s.isEmpty) None else Some(s))
val rightTargets = targets.map(_.getOverlaps(bp.rightInterval(dict)).map(_.getName).toSeq.sorted.distinct.mkString(",")).flatMap(s => if (s.isEmpty) None else Some(s))
val info = tracker(bp)
val metric = BreakpointPileup(
id = info.id.toString,
Expand Down
@@ -117,6 +117,7 @@ class AggregateSvPileupTest extends UnitSpec {
split_reads = 1,
read_pairs = 1,
total = 2,
left_targets = Some("1,2,5")
)

private val pileup_id8_1_500_plus_3_400_plus = BreakpointPileup(
@@ -210,7 +211,7 @@

private val bed = makeTempFile("targets", ".bed")
private val bedWriter = new BufferedWriter(new FileWriter(bed.toFile))
bedWriter.write("#header\nchr1\t400\t410\nchr3\t401\t402\n")
bedWriter.write("#header\nchr1\t400\t410\ttarget-1\nchr3\t401\t402\ttarget-2\n")
bedWriter.flush()
bedWriter.close()

@@ -288,6 +289,7 @@ class AggregateSvPileupTest extends UnitSpec {
total = 12,
left_pileups = PositionList(100, 200, 300, 400, 500, 600),
right_pileups = PositionList(100, 200, 200, 300, 400, 500),
left_targets = Some("1,2,5")
)

// pileup_id222_1_100_plus_1_200_plus does not combine due to different chromosome
@@ -380,6 +382,8 @@ class AggregateSvPileupTest extends UnitSpec {
right_frequency = Some(8d/8),
left_overlaps_target = true,
right_overlaps_target = true,
left_targets = Some("target-1"),
right_targets = Some("target-2"),
)

val expAgg2 = AggregatedBreakpointPileup(
@@ -417,7 +421,6 @@ class AggregateSvPileupTest extends UnitSpec {
pileup_id10_1_525_plus_3_425_plus,
)


val expAgg = AggregatedBreakpointPileup(
id = "10_7_8_9",
category = "Inter-contig rearrangement",
@@ -436,6 +439,7 @@ class AggregateSvPileupTest extends UnitSpec {
right_pileups = PositionList(300, 400, 425, 500),
left_frequency = Some(8d/16),
right_frequency = Some(8d/8),
left_targets = Some("1,2,5")
)

val aggregatedPileups = runEndToEnd(pileups, bam = Some(bam))
@@ -471,6 +475,8 @@ class AggregateSvPileupTest extends UnitSpec {
right_frequency = None,
left_overlaps_target = true,
right_overlaps_target = true,
left_targets = Some("target-1"),
right_targets = Some("target-2"),
)

val aggregatedPileups = runEndToEnd(pileups, targets = Some(bed))