Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,12 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.recalibration.*;
import org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList;
import org.broadinstitute.hellbender.utils.recalibration.covariates.BQSRCovariateList;
import org.broadinstitute.hellbender.utils.reference.ReferenceBases;
import org.broadinstitute.hellbender.utils.variant.GATKVariant;
import scala.Tuple2;

import java.util.Arrays;

public final class BaseRecalibratorSparkFn {

Expand All @@ -34,7 +38,7 @@ public static RecalibrationReport apply(final JavaPairRDD<GATKRead, Iterable<GAT
return Iterators.singletonIterator(bqsr.getRecalibrationTables());
});

final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(recalArgs, header));
final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new BQSRCovariateList(recalArgs, header)); // tsato: Potentially here!
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what this comment means - but please remove all of the tsato comments. I haven't yet looked at the RecalibrationTables class, but if there is an outstanding issue let me know what it is.

final RecalibrationTables combinedTables = unmergedTables.treeAggregate(emptyRecalibrationTable,
RecalibrationTables::inPlaceCombine,
RecalibrationTables::inPlaceCombine,
Expand All @@ -45,7 +49,7 @@ public static RecalibrationReport apply(final JavaPairRDD<GATKRead, Iterable<GAT

final QuantizationInfo quantizationInfo = new QuantizationInfo(combinedTables, recalArgs.QUANTIZING_LEVELS);

final StandardCovariateList covariates = new StandardCovariateList(recalArgs, header);
final BQSRCovariateList covariates = new BQSRCovariateList(recalArgs, header);
return RecalUtils.createRecalibrationReport(recalArgs.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), RecalUtils.generateReportTables(combinedTables, covariates));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ public final class ApplyBQSR extends ReadWalker{
*/
@ArgumentCollection
public ApplyBQSRArgumentCollection bqsrArgs = new ApplyBQSRArgumentCollection();


private BQSRReadTransformer bqsrTransformer;
private SAMFileGATKReadWriter outputWriter;

/**
Expand All @@ -111,7 +112,12 @@ public ReadTransformer makePostReadFilterTransformer(){
@Override
public void onTraversalStart() {
outputWriter = createSAMWriter(output, true);
bqsrTransformer = new BQSRReadTransformer(getHeaderForReads(), bqsrRecalFile, bqsrArgs);
Utils.warnOnNonIlluminaReadGroups(getHeaderForReads(), logger);

logger.info(String.format("Loaded covariates %s from recalibration table %s",
bqsrTransformer.getCovariates().covariateNames(),
bqsrRecalFile.getAbsolutePath()));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.broadinstitute.hellbender.utils.recalibration.QuantizationInfo;
import org.broadinstitute.hellbender.utils.recalibration.RecalUtils;
import org.broadinstitute.hellbender.utils.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.hellbender.utils.recalibration.covariates.BQSRCovariateList;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.io.PrintStream;
Expand Down Expand Up @@ -139,6 +140,11 @@ public boolean requiresReference() {
*/
@Override
public void onTraversalStart() {
if ( recalArgs.LIST_ONLY ) {
logger.info("Available covariates: " + BQSRCovariateList.getAllDiscoveredCovariateNames());
System.exit(0);
}

if (recalArgs.FORCE_PLATFORM != null) {
recalArgs.DEFAULT_PLATFORM = recalArgs.FORCE_PLATFORM;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import org.broadinstitute.hellbender.utils.recalibration.covariates.CovariateKeyCache;
import org.broadinstitute.hellbender.utils.recalibration.covariates.PerReadCovariateMatrix;
import org.broadinstitute.hellbender.utils.recalibration.covariates.ReadGroupCovariate;
import org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList;
import org.broadinstitute.hellbender.utils.recalibration.covariates.BQSRCovariateList;

import java.io.File;
import java.util.Arrays;
Expand All @@ -30,7 +30,7 @@ public final class BQSRReadTransformer implements ReadTransformer {
private static final long serialVersionUID = 1L;

private final RecalibrationTables recalibrationTables;
private final StandardCovariateList covariates; // list of all covariates to be used in this calculation
private final BQSRCovariateList covariates; // list of all covariates to be used in this calculation
private final SAMFileHeader header;

private final int preserveQLessThan;
Expand All @@ -39,12 +39,11 @@ public final class BQSRReadTransformer implements ReadTransformer {

//These fields are created to avoid redoing these calculations for every read
private final int totalCovariateCount;
private final int specialCovariateCount;

private static final int BASE_SUBSTITUTION_INDEX = EventType.BASE_SUBSTITUTION.ordinal();
public static final int BASE_SUBSTITUTION_INDEX = EventType.BASE_SUBSTITUTION.ordinal();

//Note: varargs allocates a new array every time. We'll pre-allocate one array and reuse it to avoid object allocation for every base of every read.
private final RecalDatum[] recalDatumsForSpecialCovariates;
private final RecalDatum[] recalDatumsForAdditionalCovariates;
private final boolean useOriginalBaseQualities;

// TODO: this should be merged with the quantized quals
Expand Down Expand Up @@ -73,13 +72,14 @@ public BQSRReadTransformer(final SAMFileHeader header, final File bqsrRecalFile,
* @param header header for the reads
* @param recalibrationTables recalibration tables output from BQSR
* @param quantizationInfo quantization info
* @param covariates standard covariate set
* @param bqsrCovariateList the list of covariate objects
* @param args ApplyBQSR arguments
*/
private BQSRReadTransformer(final SAMFileHeader header, final RecalibrationTables recalibrationTables, final QuantizationInfo quantizationInfo, final StandardCovariateList covariates, final ApplyBQSRArgumentCollection args) {
private BQSRReadTransformer( final SAMFileHeader header, final RecalibrationTables recalibrationTables, final QuantizationInfo quantizationInfo,
final BQSRCovariateList bqsrCovariateList, final ApplyBQSRArgumentCollection args) {
this.header = header;
this.recalibrationTables = recalibrationTables;
this.covariates = covariates;
this.covariates = bqsrCovariateList;

if (args.quantizationLevels == 0) { // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
Expand All @@ -99,11 +99,11 @@ private BQSRReadTransformer(final SAMFileHeader header, final RecalibrationTable
}

this.quantizedQuals = quantizationInfo.getQuantizedQuals();
this.totalCovariateCount = covariates.size();
this.specialCovariateCount = covariates.numberOfSpecialCovariates();
this.totalCovariateCount = bqsrCovariateList.size();
final int additionalCovariateCount = bqsrCovariateList.getAdditionalCovariates().size();

//Note: We pre-create the varargs arrays that will be used in the calls. Otherwise we're spending a lot of time allocating those int[] objects
this.recalDatumsForSpecialCovariates = new RecalDatum[specialCovariateCount];
this.recalDatumsForAdditionalCovariates = new RecalDatum[additionalCovariateCount];
this.keyCache = new CovariateKeyCache();//one cache per transformer
this.allowMissingReadGroups = args.allowMissingReadGroups;
}
Expand All @@ -119,22 +119,26 @@ public BQSRReadTransformer(final SAMFileHeader header, final RecalibrationReport
this(header, recalInfo.getRecalibrationTables(), recalInfo.getQuantizationInfo(), recalInfo.getCovariates(), args);
}

public BQSRCovariateList getCovariates() {
return covariates;
}

/**
* Recalibrates the base qualities of a read
* <p>
* It updates the base qualities of the read with the new recalibrated qualities (for all event types)
* <p>
* Implements a serial recalibration of the reads using the combinational table.
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
* First, we perform a positional recalibration, and then a subsequent dinucleotide (dinuc) correction.
* <p>
* Given the full recalibration table, we perform the following preprocessing steps:
* <p>
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Q_empirical(pos, qual, dinuc) - Q_reported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
* <p>
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
* Q_recal = Q_reported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
*
* @param originalRead the read to recalibrate
*/
Expand Down Expand Up @@ -203,26 +207,26 @@ public GATKRead apply(final GATKRead originalRead) {
}

// clear and reuse this array to save space
Arrays.fill(recalDatumsForSpecialCovariates, null);
Arrays.fill(recalDatumsForAdditionalCovariates, null);
final int[] covariatesAtOffset = covariatesForRead[offset];
final int reportedBaseQualityAtOffset = covariatesAtOffset[StandardCovariateList.BASE_QUALITY_COVARIATE_DEFAULT_INDEX];
final int reportedBaseQualityAtOffset = covariatesAtOffset[BQSRCovariateList.BASE_QUALITY_COVARIATE_DEFAULT_INDEX];

// Datum for the tuple (read group, reported quality score).
final RecalDatum qualityScoreDatum = recalibrationTables.getQualityScoreTable()
.get3Keys(rgKey, reportedBaseQualityAtOffset, BASE_SUBSTITUTION_INDEX);

for (int j = StandardCovariateList.NUM_REQUIRED_COVARITES; j < totalCovariateCount; j++) {
for (int j = BQSRCovariateList.NUM_REQUIRED_COVARITES; j < totalCovariateCount; j++) {
// If the covariate is -1 (e.g. the first base in each read should have -1 for the context covariate),
// we simply leave the corresponding Datum to be null, which will subsequently be ignored when it comes time to recalibrate.
if (covariatesAtOffset[j] >= 0) {
recalDatumsForSpecialCovariates[j - StandardCovariateList.NUM_REQUIRED_COVARITES] = recalibrationTables.getTable(j)
recalDatumsForAdditionalCovariates[j - BQSRCovariateList.NUM_REQUIRED_COVARITES] = recalibrationTables.getTable(j)
.get4Keys(rgKey, reportedBaseQualityAtOffset, covariatesAtOffset[j], BASE_SUBSTITUTION_INDEX);
}
}

// Use the reported quality score of the read group as the prior, which can be non-integer because of collapsing.
final double priorQualityScore = constantQualityScorePrior > 0.0 ? constantQualityScorePrior : readGroupDatum.getReportedQuality();
final double rawRecalibratedQualityScore = hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, recalDatumsForSpecialCovariates);
final double rawRecalibratedQualityScore = hierarchicalBayesianQualityEstimate(priorQualityScore, readGroupDatum, qualityScoreDatum, recalDatumsForAdditionalCovariates);
final byte quantizedQualityScore = quantizedQuals.get(getBoundedIntegerQual(rawRecalibratedQualityScore));

// TODO: as written the code quantizes *twice* if the static binning is enabled (first time to the dynamic bin). It should be quantized once.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import org.broadinstitute.hellbender.utils.recalibration.covariates.Covariate;
import org.broadinstitute.hellbender.utils.recalibration.covariates.CovariateKeyCache;
import org.broadinstitute.hellbender.utils.recalibration.covariates.PerReadCovariateMatrix;
import org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList;
import org.broadinstitute.hellbender.utils.recalibration.covariates.BQSRCovariateList;

import java.io.Serializable;
import java.util.Arrays;
Expand Down Expand Up @@ -68,7 +68,7 @@ public SimpleInterval apply( GATKRead read ) {
/**
* list to hold the all the covariate objects that were requested (required + standard + experimental)
*/
private StandardCovariateList covariates;
private BQSRCovariateList covariates;

private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector
private static final byte NO_BAQ_UNCERTAINTY = (byte)'@';
Expand All @@ -90,7 +90,7 @@ public BaseRecalibrationEngine( final RecalibrationArgumentCollection recalArgs,
baq = null;
}

covariates = new StandardCovariateList(recalArgs, readsHeader);
covariates = new BQSRCovariateList(recalArgs, readsHeader);

final int numReadGroups = readsHeader.getReadGroups().size();
if ( numReadGroups < 1 ) {
Expand Down Expand Up @@ -236,7 +236,7 @@ public RecalibrationTables getFinalRecalibrationTables() {
return recalTables;
}

public StandardCovariateList getCovariates() {
public BQSRCovariateList getCovariates() {
return covariates;
}

Expand All @@ -260,6 +260,7 @@ private void updateRecalTablesForRead( final ReadRecalibrationInfo recalInfo ) {
final NestedIntegerArray<RecalDatum> qualityScoreTable = recalTables.getQualityScoreTable();

final int nCovariates = covariates.size();
final int nSpecialCovariates = BQSRCovariateList.numberOfRequiredCovariates();
final int readLength = read.getLength();
for( int offset = 0; offset < readLength; offset++ ) {
if( ! recalInfo.skip(offset) ) {
Expand All @@ -270,8 +271,8 @@ private void updateRecalTablesForRead( final ReadRecalibrationInfo recalInfo ) {
final byte qual = recalInfo.getQual(eventType, offset);
final double isError = recalInfo.getErrorFraction(eventType, offset);

final int readGroup = covariatesAtOffset[StandardCovariateList.READ_GROUP_COVARIATE_DEFAULT_INDEX];
final int baseQuality = covariatesAtOffset[StandardCovariateList.BASE_QUALITY_COVARIATE_DEFAULT_INDEX];
final int readGroup = covariatesAtOffset[BQSRCovariateList.READ_GROUP_COVARIATE_DEFAULT_INDEX];
final int baseQuality = covariatesAtOffset[BQSRCovariateList.BASE_QUALITY_COVARIATE_DEFAULT_INDEX];

RecalUtils.incrementDatum3keys(qualityScoreTable, qual, isError, readGroup, baseQuality, eventIndex);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,8 @@ public boolean skip(final int offset) {
}

/**
* Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets
* @return a non-null ReadCovariates object
* Get the PerReadCovariateMatrix object carrying the mapping from offsets -> covariate key sets
* @return a non-null PerReadCovariateMatrix object
*/
public PerReadCovariateMatrix getCovariatesValues() {
return covariates;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,4 +365,31 @@ protected static double getLogBinomialLikelihood(final double qualityScore, long
final double logLikelihood = MathUtils.logBinomialProbability((int) nObservations, (int) nErrors, QualityUtils.qualToErrorProb(qualityScore));
return ( Double.isInfinite(logLikelihood) || Double.isNaN(logLikelihood) ) ? -Double.MAX_VALUE : logLikelihood;
}

@Override
public boolean equals(Object obj) {
if (this == obj){
return true;
}

if (!(obj instanceof RecalDatum) || obj == null) {
return false;
}

RecalDatum other = (RecalDatum) obj;
return numObservations == other.numObservations
&& Double.compare(numMismatches, other.numMismatches) == 0
&& Double.compare(reportedQuality, other.reportedQuality) == 0
&& empiricalQuality == other.empiricalQuality;
}

@Override
public int hashCode() {
int result = 17;
result = 31 * result + Long.hashCode(numObservations);
result = 31 * result + Double.hashCode(numMismatches);
result = 31 * result + Double.hashCode(reportedQuality);
result = 31 * result + Integer.hashCode(empiricalQuality);
return result;
}
}
Loading
Loading