[AVOCADO] Update ADAM dependency to 0.26.0. #307

Open · wants to merge 2 commits into base: master

@@ -216,11 +216,11 @@ class BiallelicGenotyper(
 
     // load reads
     val projection = Some(Filter(AlignmentRecordField.attributes,
-      AlignmentRecordField.origQual,
-      AlignmentRecordField.recordGroupName))
+      AlignmentRecordField.originalQuality,
+      AlignmentRecordField.readGroupId))
     val reads = sc.loadAlignments(args.inputPath,
       optProjection = projection)
-    val samples = reads.recordGroups.recordGroups.map(_.sample).toSet
+    val samples = reads.readGroups.readGroups.map(_.sampleId).toSet
     require(samples.nonEmpty,
       "Didn't see any samples attached to input. Did you forget to add read groups?")
     require(samples.size <= 1,
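
For reference, a minimal sketch of the updated read-loading path, assuming a SparkContext `sc` with ADAMContext implicits in scope and a hypothetical input path; the field and method names follow the renames in this hunk:

    import org.bdgenomics.adam.projections.{ AlignmentRecordField, Filter }
    import org.bdgenomics.adam.rdd.ADAMContext._

    // Project only the fields the genotyper needs. In ADAM 0.26.0,
    // origQual became originalQuality and recordGroupName became readGroupId.
    val projection = Some(Filter(AlignmentRecordField.attributes,
      AlignmentRecordField.originalQuality,
      AlignmentRecordField.readGroupId))
    val reads = sc.loadAlignments("sample.alignments.adam", // hypothetical path
      optProjection = projection)

    // recordGroups became readGroups, and sample became sampleId.
    val samples = reads.readGroups.readGroups.map(_.sampleId).toSet
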
@@ -56,7 +56,7 @@ class MergeDiscovered(
 
     sc.loadVariants(args.inputPath)
       .transformDataset(_.dropDuplicates("start",
         "end",
-        "contigName",
+        "referenceName",
         "referenceAllele",
         "alternateAllele"))
       .saveAsParquet(args.outputPath)
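
The same rename applies to the deduplication key: the Variant schema's contigName column is now referenceName. A sketch with hypothetical paths, using the standard Spark dropDuplicates through ADAM's transformDataset:

    import org.bdgenomics.adam.rdd.ADAMContext._

    sc.loadVariants("discovered.variants.adam") // hypothetical path
      .transformDataset(_.dropDuplicates("start",
        "end",
        "referenceName",
        "referenceAllele",
        "alternateAllele"))
      .saveAsParquet("merged.variants.adam") // hypothetical path
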
@@ -20,7 +20,7 @@ package org.bdgenomics.avocado.cli
 
 import org.apache.spark.SparkContext
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
-import org.bdgenomics.adam.rdd.read.{ AlignmentRecordRDD, MDTagging }
+import org.bdgenomics.adam.rdd.read.{ AlignmentRecordDataset, MDTagging }
 import org.bdgenomics.avocado.realigner.Realigner
 import org.bdgenomics.utils.cli._
 import org.kohsuke.args4j.{ Argument, Option => Args4jOption }
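
Throughout this PR the change is mechanical: ADAM 0.26.0 renamed its RDD-suffixed genomic dataset classes to Dataset-suffixed ones. A sketch of the import-level rename, with the class list taken from the hunks in this PR:

    // Before (pre-0.26.0 ADAM):
    // import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
    // import org.bdgenomics.adam.rdd.variant.{ GenotypeRDD, VariantRDD, VariantContextRDD }

    // After (ADAM 0.26.0):
    import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
    import org.bdgenomics.adam.rdd.variant.{
      GenotypeDataset,
      VariantDataset,
      VariantContextDataset
    }
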
@@ -21,8 +21,8 @@ import org.apache.spark.SparkContext
 
 import org.bdgenomics.adam.projections.{ AlignmentRecordField, Filter }
 import org.bdgenomics.adam.rdd.ADAMContext._
 import org.bdgenomics.adam.rdd.ADAMSaveAnyArgs
-import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
-import org.bdgenomics.adam.rdd.variant.GenotypeRDD
+import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
+import org.bdgenomics.adam.rdd.variant.GenotypeDataset
 import org.bdgenomics.avocado.genotyping.{
   BiallelicGenotyper => Biallelic,
   DiscoverVariants => Discover,
@@ -198,8 +198,8 @@ class TrioGenotyper(
 
     // load reads
     val projection = Some(Filter(AlignmentRecordField.attributes,
-      AlignmentRecordField.origQual,
-      AlignmentRecordField.recordGroupName))
+      AlignmentRecordField.originalQuality,
+      AlignmentRecordField.readGroupId))
     val firstParentReads = sc.loadAlignments(args.firstParentPath,
       optProjection = projection)
     val secondParentReads = sc.loadAlignments(args.secondParentPath,
@@ -250,7 +250,7 @@ class TrioGenotyper(
       copyNumber,
       false)
 
-    val genotypes = GenotypeRDD(sc.union(firstParentGenotypes.rdd,
+    val genotypes = GenotypeDataset(sc.union(firstParentGenotypes.rdd,
       secondParentGenotypes.rdd,
       childGenotypes.rdd),
       variants.sequences,
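
A sketch of the merge step under the new names, assuming the three per-member GenotypeDataset values and the variants dataset exist as in TrioGenotyper; the four-argument GenotypeDataset constructor (genotype RDD, sequence dictionary, samples, header lines) follows the usage visible elsewhere in this PR, and the sample IDs here are hypothetical:

    import org.bdgenomics.adam.rdd.variant.GenotypeDataset
    import org.bdgenomics.formats.avro.Sample

    // Union the per-member genotype RDDs and rewrap them as one dataset.
    val sampleIds = Seq("father", "mother", "child") // hypothetical IDs
    val genotypes = GenotypeDataset(
      sc.union(firstParentGenotypes.rdd,
        secondParentGenotypes.rdd,
        childGenotypes.rdd),
      variants.sequences,
      sampleIds.map(s => Sample.newBuilder().setId(s).setName(s).build()),
      org.bdgenomics.adam.converters.DefaultHeaderLines.allHeaderLines)
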
@@ -25,10 +25,10 @@ import org.apache.spark.sql.SQLContext
 
 import org.apache.spark.sql.types.IntegerType
 import org.bdgenomics.adam.models.ReferenceRegion
 import org.bdgenomics.adam.rdd.GenomeBins
-import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
+import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
 import org.bdgenomics.adam.rdd.variant.{
-  GenotypeRDD,
-  VariantRDD
+  GenotypeDataset,
+  VariantDataset
 }
 import org.bdgenomics.adam.util.PhredUtils
 import org.bdgenomics.avocado.Timers._
@@ -85,21 +85,21 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
    * @param maxMapQ The highest mapping quality to allow.
    * @return Returns genotype calls.
    */
-  def call(reads: AlignmentRecordRDD,
-    variants: VariantRDD,
+  def call(reads: AlignmentRecordDataset,
+    variants: VariantDataset,
     copyNumber: CopyNumberMap,
     scoreAllSites: Boolean,
     optDesiredPartitionCount: Option[Int] = None,
     optDesiredPartitionSize: Option[Int] = None,
     optDesiredMaxCoverage: Option[Int] = None,
     maxQuality: Int = 93,
-    maxMapQ: Int = 93): GenotypeRDD = CallGenotypes.time {
+    maxMapQ: Int = 93): GenotypeDataset = CallGenotypes.time {
 
     // validate metadata
     require(variants.sequences.isCompatibleWith(reads.sequences),
       "Variant sequence dictionary (%s) is not compatible with read dictionary (%s).".format(
         variants.sequences, reads.sequences))
-    val samples = reads.recordGroups.recordGroups.map(_.sample).toSet
+    val samples = reads.readGroups.readGroups.map(_.sampleId).toSet
     require(samples.size == 1,
       "Currently, we only support a single sample. Saw: %s.".format(
         samples.mkString(", ")))
@@ -124,11 +124,11 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
     val genotypeRdd = observationsToGenotypes(observationRdd,
       samples.head)
 
-    GenotypeRDD(genotypeRdd,
+    GenotypeDataset(genotypeRdd,
       variants.sequences,
       samples.map(s => {
         Sample.newBuilder()
-          .setSampleId(s)
+          .setId(s)
           .setName(s)
           .build()
       }).toSeq, org.bdgenomics.adam.converters.DefaultHeaderLines.allHeaderLines)
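
The Sample builder rename is small but easy to miss: setSampleId is now setId. A minimal sketch, assuming Sample comes from bdg-formats as elsewhere in this file and with a hypothetical sample name:

    import org.bdgenomics.formats.avro.Sample

    val sample = Sample.newBuilder()
      .setId("NA12878")   // was .setSampleId(...) before 0.26.0
      .setName("NA12878") // unchanged
      .build()
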
@@ -153,7 +153,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
    * @param maxMapQ The highest mapping quality to allow.
    * @return Returns genotype calls.
    */
-  def discoverAndCall(reads: AlignmentRecordRDD,
+  def discoverAndCall(reads: AlignmentRecordDataset,
     copyNumber: CopyNumberMap,
     scoreAllSites: Boolean,
     optDesiredPartitionCount: Option[Int] = None,
@@ -162,7 +162,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
     optDesiredPartitionSize: Option[Int] = None,
     optDesiredMaxCoverage: Option[Int] = None,
     maxQuality: Int = 93,
-    maxMapQ: Int = 93): GenotypeRDD = {
+    maxMapQ: Int = 93): GenotypeDataset = {
 
     // get rdd storage level and warn if not persisted
     val readSl = reads.rdd.getStorageLevel
@@ -442,7 +442,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
     // flatten schema
     val flatFields = Seq(
-      observationsDf("_1.contigName").as("contigName"),
+      observationsDf("_1.referenceName").as("referenceName"),
       observationsDf("_1.start").as("start"),
       observationsDf("_1.referenceAllele").as("referenceAllele"),
       observationsDf("_1.alternateAllele").as("alternateAllele"),
@@ -493,14 +493,14 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
       sum("totalCoverage").as("totalCoverage"),
       first("isRef").as("isRef"),
       first("copyNumber").as("copyNumber"))
-    val aggregatedObservationsDf = joinedObservationsDf.groupBy("contigName",
+    val aggregatedObservationsDf = joinedObservationsDf.groupBy("referenceName",
       "start",
       "referenceAllele",
       "alternateAllele")
       .agg(aggCols.head, aggCols.tail: _*)
 
     // re-nest the output
-    val firstField = struct(aggregatedObservationsDf("contigName"),
+    val firstField = struct(aggregatedObservationsDf("referenceName"),
       aggregatedObservationsDf("start"),
       aggregatedObservationsDf("referenceAllele"),
       aggregatedObservationsDf("alternateAllele"))
@@ -733,7 +733,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
 
       .setVariantCallingAnnotations(vcAnnotations)
       .setStart(v.getStart)
       .setEnd(v.getEnd)
-      .setContigName(v.getContigName)
+      .setReferenceName(v.getReferenceName)
       .setSampleId(sample)
       .setStrandBiasComponents(sbComponents
         .map(i => i: java.lang.Integer))
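
A sketch of the renamed coordinate accessors on the genotype builder; siteOf is a hypothetical helper, but the getters and setters follow this hunk (Variant and Genotype are the bdg-formats Avro classes):

    import org.bdgenomics.formats.avro.{ Genotype, Variant }

    // getContigName/setContigName became getReferenceName/setReferenceName.
    def siteOf(v: Variant, sample: String): Genotype = {
      Genotype.newBuilder
        .setStart(v.getStart)
        .setEnd(v.getEnd)
        .setReferenceName(v.getReferenceName)
        .setSampleId(sample)
        .build()
    }
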
@@ -19,8 +19,8 @@ package org.bdgenomics.avocado.genotyping
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SQLContext
-import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD
-import org.bdgenomics.adam.rdd.variant.VariantRDD
+import org.bdgenomics.adam.rdd.read.AlignmentRecordDataset
+import org.bdgenomics.adam.rdd.variant.VariantDataset
 import org.bdgenomics.avocado.Timers._
 import org.bdgenomics.avocado.models.{
   Clipped,
@@ -41,19 +41,19 @@ import scala.annotation.tailrec
 
 object DiscoverVariants extends Serializable with Logging {
 
   /**
-   * Discovers all variants in an RDD of reads.
+   * Discovers all variants in a dataset of reads.
    *
-   * @param aRdd RDD of reads.
+   * @param aRdd Dataset of reads.
    * @param optPhredThreshold An optional threshold that discards all variants
    *   not supported by bases of at least a given phred score.
-   * @return Returns an RDD of variants.
+   * @return Returns a dataset of variants.
    */
   private[avocado] def apply(
-    aRdd: AlignmentRecordRDD,
+    aRdd: AlignmentRecordDataset,
     optPhredThreshold: Option[Int] = None,
-    optMinObservations: Option[Int] = None): VariantRDD = DiscoveringVariants.time {
+    optMinObservations: Option[Int] = None): VariantDataset = DiscoveringVariants.time {
 
-    VariantRDD(variantsInRdd(aRdd.rdd,
+    VariantDataset(variantsInRdd(aRdd.rdd,
       optPhredThreshold = optPhredThreshold,
       optMinObservations = optMinObservations),
       aRdd.sequences,
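
A usage sketch for the updated signature; apply is private[avocado], so a caller inside the package would invoke it roughly like this, with reads an assumed AlignmentRecordDataset:

    // Discover variants supported by phred >= 20 bases, seen at least twice.
    val variants: VariantDataset = DiscoverVariants(reads,
      optPhredThreshold = Some(20),
      optMinObservations = Some(2))
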
@@ -87,7 +87,7 @@ object DiscoverVariants extends Serializable with Logging {
 
     val uniqueVariants = optMinObservations.fold({
       variantDs.distinct
     })(mo => {
-      variantDs.groupBy(variantDs("contigName"),
+      variantDs.groupBy(variantDs("referenceName"),
         variantDs("start"),
         variantDs("referenceAllele"),
         variantDs("alternateAllele"))
@@ -132,8 +132,8 @@ object DiscoverVariants extends Serializable with Logging {
 
     // get the read sequence, contig, etc
     val sequence = read.getSequence
-    val qual = read.getQual
-    val contigName = read.getContigName
+    val qual = read.getQuality
+    val referenceName = read.getReferenceName
 
     // advance to the first alignment match
     @tailrec def fastForward(
@@ -198,7 +198,7 @@ object DiscoverVariants extends Serializable with Logging {
 
     val newVars = (0 until length).flatMap(i => {
       if (qual(i).toInt - 33 >= phredThreshold) {
         Some(DiscoveredVariant(
-          contigName,
+          referenceName,
           pos + i,
           ref(i).toString,
           sequence(idx + i).toString))
@@ -216,7 +216,7 @@ object DiscoverVariants extends Serializable with Logging {
 
     val insQuals = qual.substring(idx - 1, idx + length).map(_.toInt - 33).sum / length
     val newVar = if (insQuals >= phredThreshold) {
       DiscoveredVariant(
-        contigName,
+        referenceName,
         pos - 1,
         lastRef,
         sequence.substring(idx - 1, idx + length)) :: variants
@@ -230,7 +230,7 @@ object DiscoverVariants extends Serializable with Logging {
 
     val delLength = ref.size
     val newVar = if (qual(idx - 1).toInt - 33 >= phredThreshold) {
       DiscoveredVariant(
-        contigName,
+        referenceName,
         pos - 1,
         lastRef + ref,
         sequence.substring(idx - 1, idx)) :: variants
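
These hunks gate variant emission on phred scores decoded from the read's quality string; the - 33 is the standard Phred+33 ASCII offset. A tiny standalone check of the decoding:

    // Decode a Phred+33 quality string into integer base qualities.
    val qual = "II5#" // hypothetical quality string
    val phredScores = qual.map(_.toInt - 33) // 40, 40, 20, 2
    val passing = phredScores.count(_ >= 20) // 3 bases at or above Q20
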
@@ -30,25 +30,25 @@ private[genotyping] object DiscoveredVariant {
 
    * @return Returns a case class-based representation of the variant.
    */
   def apply(variant: Variant): DiscoveredVariant = {
-    new DiscoveredVariant(variant.getContigName,
+    new DiscoveredVariant(variant.getReferenceName,
       variant.getStart.toInt,
       variant.getReferenceAllele,
       Some(variant.getAlternateAllele))
   }
 
   /**
-   * @param contigName The contig this variant is on.
+   * @param referenceName The reference this variant is on.
    * @param start The position this variant starts at.
    * @param referenceAllele The reference allele this variant varies from.
    * @param alternateAllele The substituted allele.
    * @return Returns a discovered variant with a defined alternate allele.
    */
   def apply(
-    contigName: String,
+    referenceName: String,
     start: Int,
     referenceAllele: String,
     alternateAllele: String): DiscoveredVariant = {
-    new DiscoveredVariant(contigName, start, referenceAllele, Some(alternateAllele))
+    new DiscoveredVariant(referenceName, start, referenceAllele, Some(alternateAllele))
   }
 
   /**
@@ -64,13 +64,13 @@ private[genotyping] object DiscoveredVariant {
 
 /**
  * A variant site and alleles.
  *
- * @param contigName The contig this variant is on.
+ * @param referenceName The reference this variant is on.
  * @param start The position this variant starts at.
  * @param referenceAllele The reference allele this variant varies from.
  * @param alternateAllele The substituted allele.
  */
 case class DiscoveredVariant(
-  contigName: String,
+  referenceName: String,
   start: Int,
   referenceAllele: String,
   alternateAllele: Option[String]) {
@@ -87,7 +87,7 @@ case class DiscoveredVariant(
 
    */
   def toVariant: Variant = {
     val builder = Variant.newBuilder
-      .setContigName(contigName)
+      .setReferenceName(referenceName)
       .setStart(start.toLong)
       .setEnd(end.toLong)
       .setReferenceAllele(referenceAllele)
@@ -100,10 +100,10 @@ case class DiscoveredVariant(
 
   }
 
   def overlaps(v: DiscoveredVariant): Boolean = {
-    contigName == v.contigName && start < v.end && end > v.start
+    referenceName == v.referenceName && start < v.end && end > v.start
   }
 
   def overlaps(rr: ReferenceRegion): Boolean = {
-    contigName == rr.referenceName && start < rr.end && end > rr.start
+    referenceName == rr.referenceName && start < rr.end && end > rr.start
   }
 }
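
A quick check of the half-open overlap predicate under the renamed field; the values are illustrative, and the four-argument apply is callable from inside the genotyping package:

    val a = DiscoveredVariant("chr1", 100, "A", "T") // hypothetical sites
    val b = DiscoveredVariant("chr1", 100, "A", "G")
    assert(a.overlaps(b)) // same referenceName, intersecting [start, end)
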
@@ -21,8 +21,8 @@ import breeze.stats.distributions.Binomial
 
 import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.models.VariantContext
 import org.bdgenomics.adam.rdd.variant.{
-  GenotypeRDD,
-  VariantContextRDD
+  GenotypeDataset,
+  VariantContextDataset
 }
 import org.bdgenomics.adam.util.PhredUtils
 import org.bdgenomics.avocado.util.LogUtils
@@ -49,7 +49,7 @@ object JointAnnotatorCaller extends Serializable {
 
    * @param genotypes The genotypes to jointly process.
    * @return Returns a squared off and annotated set of variant contexts.
    */
-  def apply(genotypes: GenotypeRDD): VariantContextRDD = {
+  def apply(genotypes: GenotypeDataset): VariantContextDataset = {
     apply(genotypes.toVariantContexts)
   }
 
@@ -59,7 +59,7 @@ object JointAnnotatorCaller extends Serializable {
 
    * @param variantContexts The squared off sites to process.
    * @return Returns a squared off and annotated set of variant contexts.
    */
-  def apply(variantContexts: VariantContextRDD): VariantContextRDD = {
+  def apply(variantContexts: VariantContextDataset): VariantContextDataset = {
     variantContexts.transform(_.flatMap(annotateSite))
   }
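
A usage sketch under the renamed types, assuming a GenotypeDataset named genotypes produced by one of the callers above:

    // Square off genotypes into variant contexts and annotate each site.
    val annotated: VariantContextDataset = JointAnnotatorCaller(genotypes)
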
@@ -52,11 +52,11 @@ private[genotyping] object Observer extends Serializable {
 
     // for convenience, get the sample name, mapping quality, sequence,
     // qualities, and the contig name
-    val sampleId = read.getRecordGroupSample
-    val contigName = read.getContigName
-    val mapQ = read.getMapq
+    val sampleId = read.getReadGroupSampleId
+    val referenceName = read.getReferenceName
+    val mapQ = read.getMappingQuality
     val readSequence = read.getSequence
-    val readQualities = read.getQual
+    val readQualities = read.getQuality
     val forwardStrand = !read.getReadNegativeStrand
 
     // map over the alignment operators and generate allelic observations
@@ -73,7 +73,7 @@ private[genotyping] object Observer extends Serializable {
 
     (0 until length).map(idx => {
 
       // the key is the (site, allele, sampleId)
-      val key = (ReferenceRegion(contigName, pos, pos + 1),
+      val key = (ReferenceRegion(referenceName, pos, pos + 1),
        readSequence(readIdx).toString,
        sampleId)
 
@@ -104,7 +104,7 @@ private[genotyping] object Observer extends Serializable {
 
       // the key is the (site, allele, sampleId)
       // insertions associate to the site to their left, hence the -1
-      val key = (ReferenceRegion(contigName, pos - 1, pos),
+      val key = (ReferenceRegion(referenceName, pos - 1, pos),
        bases,
        sampleId)
 
@@ -124,7 +124,7 @@ private[genotyping] object Observer extends Serializable {
 
       // the key is the (site, allele, sampleId)
       // deletions have an empty string for the allele
-      val key = (ReferenceRegion(contigName, oldPos, pos),
+      val key = (ReferenceRegion(referenceName, oldPos, pos),
        "",
        sampleId)
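
A condensed sketch of the observation-key convention after the rename: each key pairs a ReferenceRegion on the renamed referenceName with the observed allele and the sample ID. All values here are hypothetical:

    import org.bdgenomics.adam.models.ReferenceRegion

    val referenceName = "chr1" // hypothetical
    val sampleId = "NA12878"   // hypothetical
    val pos = 1000L

    // match/SNV: one-base site, the observed base as the allele
    val snvKey = (ReferenceRegion(referenceName, pos, pos + 1), "A", sampleId)

    // insertion: associates to the site to its left, hence the -1
    val insKey = (ReferenceRegion(referenceName, pos - 1, pos), "AGG", sampleId)

    // deletion: spans the deleted reference bases, with an empty-string allele
    val delKey = (ReferenceRegion(referenceName, pos, pos + 3), "", sampleId)
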