Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.broadinstitute.hellbender.tools.funcotator.compositeoutput.CompositeOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.genelistoutput.GeneListOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.mafOutput.MafOutputRenderer;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
Expand Down Expand Up @@ -108,15 +109,19 @@ public FuncotatorEngine(final BaseFuncotatorArgumentCollection funcotatorArgs,
this.funcotatorArgs = funcotatorArgs;
inputMetadata = metadata;

// Determine whether we have to convert given variants from B37 to HG19:
mustConvertInputContigsToHg19 = determineReferenceAndDatasourceCompatibility();

dataSourceFactories = funcotationFactories;
// Note: The dataSourceFactories must be sorted to ensure that as we iterate through them
// to create funcotations, the inherent dependencies between different funcotation types are preserved.
// For example, most FuncotationFactories require that a GencodeFuncotation is present before they can
// create their annotations. This sorting enables such dependencies.
dataSourceFactories.sort(DataSourceUtils::datasourceComparator);

// Determine whether we have to convert given variants from B37 to HG19:
mustConvertInputContigsToHg19 = determineReferenceAndDatasourceCompatibility();
// Now set the GencodeFuncotationFactory to convert exon sequences to B37 for transcripts, if necessary:
// TODO: (NOTE:) This is a workaround for the B37 / HG19 problem. We really need to make B37 datasources.
dataSourceFactories.stream().filter(f -> f.getType().equals(FuncotatorArgumentDefinitions.DataSourceType.GENCODE)).forEach(f -> ((GencodeFuncotationFactory)f).setDoExonContigConversionToB37ForTranscripts(mustConvertInputContigsToHg19));

// Read in the custom variant classification order file here so that it can be shared across all engines:
if (funcotatorArgs.customVariantClassificationOrderFile != null) {
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package org.broadinstitute.hellbender.utils;

import htsjdk.samtools.util.Locatable;
import javassist.Loader;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.funcotator.FuncotatorUtils;

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
* Class with utility methods for working with transcripts.
*/
public class TranscriptUtils {

//==================================================================================================================
// Public Static Members:

/**
* Extracts the transcript sequence from the reference context given the exons.
* The given exons are assumed to be all on the same contig.
* The exons will be sorted in the order they appear in the genome before extracting the sequences.
* @param refContext the reference context. Must not be {@code null}.
* @param exons the exons of the transcript. Must not be {@code null}.
* @return the transcript sequence as coded in the given reference context.
*/
public static final String extractTrascriptFromReference(final ReferenceContext refContext,
final List<Locatable> exons) {
return extractTrascriptFromReference(refContext, exons, false);
}

/**
* Extracts the transcript sequence from the reference context given the exons.
* The given exons are assumed to be all on the same contig.
* The exons will be sorted in the order they appear in the genome before extracting the sequences.
* @param refContext the reference context. Must not be {@code null}.
* @param exons the exons of the transcript. Must not be {@code null}.
* @param convertHg19ContigToB37Contig whether to convert the contig names from hg19 to b37.
* @return the transcript sequence as coded in the given reference context.
*/
public static final String extractTrascriptFromReference(final ReferenceContext refContext,
final List<Locatable> exons,
final boolean convertHg19ContigToB37Contig ) {

// TODO: THIS CONVERSION IS BAD. WE MUST REFACTOR THIS EVENTUALLY TO REMOVE THIS AND MAKE SOME B37 DATA SOURCES.
Utils.nonNull(refContext);
Utils.nonNull(exons);

final StringBuilder transcript = new StringBuilder();

// We should iterate through the list of exons in sorted order so we can simply append them together.
for (final Locatable exon : exons.stream().sorted(Comparator.comparingInt(Locatable::getStart).thenComparing(Locatable::getEnd)).toList() ) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again this ordering/overlapping assumption

final SimpleInterval exonInterval;

if ( convertHg19ContigToB37Contig ) {
exonInterval = new SimpleInterval(FuncotatorUtils.convertHG19ContigToB37Contig(exon.getContig()), exon.getStart(), exon.getEnd());

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might be a little cleaner to read with a ternary (alternatively make some helper method somewhere called "getConvertedContigName()" (that goes for GencodeFuncotationFactory as well which also threads this a bunch)

}
else {
exonInterval = new SimpleInterval(exon.getContig(), exon.getStart(), exon.getEnd());
}
transcript.append(new String(refContext.getBases(exonInterval)));
}
return transcript.toString();
}

//==================================================================================================================
// Private Static Members:

//==================================================================================================================
// Private Members:

//==================================================================================================================
// Constructors:

//==================================================================================================================
// Override Methods:

//==================================================================================================================
// Static Methods:

//==================================================================================================================
// Instance Methods:

//==================================================================================================================
// Helper Data Types:

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
*
* Created by jonn on 7/21/17.
*/
public abstract class GencodeGtfFeature implements Feature, Comparable<GencodeGtfFeature> {
public abstract class GencodeGtfFeature implements Feature, Locatable, Comparable<GencodeGtfFeature> {

private static final Logger logger = LogManager.getLogger(GencodeGtfFeature.class);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,8 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
MafOutputRendererConstants.FieldName_Reference_Allele, MafOutputRendererConstants.FieldName_Tumor_Seq_Allele1, MafOutputRendererConstants.FieldName_Tumor_Seq_Allele2, MafOutputRendererConstants.FieldName_Genome_Change, MafOutputRendererConstants.FieldName_Annotation_Transcript, MafOutputRendererConstants.FieldName_Transcript_Strand, MafOutputRendererConstants.FieldName_Transcript_Exon, MafOutputRendererConstants.FieldName_Transcript_Position, MafOutputRendererConstants.FieldName_cDNA_Change, MafOutputRendererConstants.FieldName_Codon_Change, MafOutputRendererConstants.FieldName_Protein_Change, MafOutputRendererConstants.FieldName_gc_content, MafOutputRendererConstants.FieldName_ref_context, MafOutputRendererConstants.FieldName_Other_Transcripts);

private static String hg38Chr3Ref;
private static String b37Chr3Ref;
private static String b37Chr2Ref;
private static String hg19Chr3Ref;
private static String b37Chr2Ref;
private static String hg19Chr19Ref;

private static String eColiRef;
Expand All @@ -126,9 +125,8 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
}

hg38Chr3Ref = FuncotatorReferenceTestUtils.retrieveHg38Chr3Ref();
b37Chr3Ref = FuncotatorReferenceTestUtils.retrieveB37Chr3Ref();
b37Chr2Ref = FuncotatorReferenceTestUtils.retrieveB37Chr2Ref();
hg19Chr3Ref = FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref();
b37Chr2Ref = FuncotatorReferenceTestUtils.retrieveB37Chr2Ref();
hg19Chr19Ref = FuncotatorReferenceTestUtils.retrieveHg19Chr19Ref();
eColiRef = FuncotatorReferenceTestUtils.retrieveEcoliReference();
}
Expand Down Expand Up @@ -749,7 +747,7 @@ public void testCanAnnotateMixedContigHg19Clinvar() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -788,7 +786,7 @@ public void testManualAnnotationsCorrectness() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -867,7 +865,7 @@ public void testXsvLocatableAnnotationsHaveOnlyOneEntryForMultiHitLocations() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
XSV_CLINVAR_MULTIHIT_TEST_VCF,
outputFile,
b37Chr2Ref,
b37Reference,
DS_XSV_CLINVAR_TESTS,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -897,7 +895,7 @@ public void testXsvLocatableAnnotationsHaveCorrectColsForOnlyOnePositionSpecifie
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
XSV_CLINVAR_COL_TEST_VCF,
outputFile,
b37Chr2Ref,
b37Reference,
DS_XSV_CLINVAR_TESTS,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1067,8 +1065,8 @@ public void testExclusionFromDatasourceVcfToVcf() {
@DataProvider(name = "provideForMafVcfConcordance")
final Object[][] provideForMafVcfConcordance() {
return new Object[][]{
{PIK3CA_VCF_HG19_SNPS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 15},
{PIK3CA_VCF_HG19_INDELS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 57},
{PIK3CA_VCF_HG19_SNPS, b37Reference, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 15},
{PIK3CA_VCF_HG19_INDELS, b37Reference, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), DS_PIK3CA_DIR, true, 57},
{MUC16_VCF_HG19, hg19Chr19Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, Collections.singletonList("Gencode_19_proteinChange"), Collections.singletonList(MafOutputRendererConstants.FieldName_Protein_Change), FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER, false, 2057},
{
PIK3CA_VCF_HG38,
Expand All @@ -1080,7 +1078,7 @@ final Object[][] provideForMafVcfConcordance() {
false,
104,
},
{PIK3CA_VCF_HG19_INDELS, b37Chr3Ref, FuncotatorTestConstants.REFERENCE_VERSION_HG19, VCF_FIELDS_GENCODE_19_DS, MAF_FIELDS_GENCODE_DS, DS_PIK3CA_DIR, true, 57},
{PIK3CA_VCF_HG19_INDELS, b37Reference, FuncotatorTestConstants.REFERENCE_VERSION_HG19, VCF_FIELDS_GENCODE_19_DS, MAF_FIELDS_GENCODE_DS, DS_PIK3CA_DIR, true, 57},
};
}

Expand Down Expand Up @@ -1325,7 +1323,7 @@ public void testVcfDatasourceAccountsForAltAlleles() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19_ALTS,
vcfOutputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
vcfOutputFormatType,
Expand Down Expand Up @@ -1463,7 +1461,7 @@ private AnnotatedIntervalCollection runPik3caHg19VcfToMaf(final Set<String> excl
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorArgumentDefinitions.OutputFormatType.MAF,
Expand Down Expand Up @@ -1502,7 +1500,7 @@ public void testVCFToVCFPreservesFields() {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
PIK3CA_VCF_HG19,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1555,7 +1553,7 @@ public void testMafCustomCountFields(final String tnVcf) {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
tnVcf,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1611,7 +1609,7 @@ public void testMafCustomCountFieldsTumorOnly(final String tnVcf) {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
tnVcf,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand Down Expand Up @@ -1665,7 +1663,7 @@ private void runSomaticVcf(final String vcfFile) {
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
vcfFile,
outputFile,
b37Chr3Ref,
b37Reference,
DS_PIK3CA_DIR,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
outputFormatType,
Expand All @@ -1685,7 +1683,7 @@ public void testSequenceDictionaryCheck() {
// The input VCF and Reference file are incompatible because
// the reference file dictionary has only chromosome 2 and the
// input VCF has a dictionary that contains all contigs for HG19.
// Therefore the reference dictionary is NOT a superset of the input VCF dictionary.
// Therefore, the reference dictionary is NOT a superset of the input VCF dictionary.

final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
XSV_CLINVAR_COL_TEST_VCF,
Expand Down Expand Up @@ -1747,7 +1745,7 @@ public void testNoVariantsProduceMaf() {

arguments.addVCF(new File(EMPTY_VCF));
arguments.addOutput(outputFile);
arguments.addReference(new File(b37Chr3Ref));
arguments.addReference(new File(b37Reference));
arguments.add(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, DS_PIK3CA_DIR);
arguments.add(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19);
arguments.add(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString());
Expand Down Expand Up @@ -1777,7 +1775,7 @@ public void testEnsureDbSnpInMaf() {

arguments.addVCF(new File(MAF_DBSNP_TEST));
arguments.addOutput(outputFile);
arguments.addReference(new File(b37Chr3Ref));
arguments.addReference(new File(b37Reference));
arguments.add(FuncotatorArgumentDefinitions.DATA_SOURCES_PATH_LONG_NAME, PIK3CA_DBSNP_DS);
arguments.add(FuncotatorArgumentDefinitions.REFERENCE_VERSION_LONG_NAME, FuncotatorTestConstants.REFERENCE_VERSION_HG19);
arguments.add(FuncotatorArgumentDefinitions.OUTPUT_FORMAT_LONG_NAME, outputFormatType.toString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1353,8 +1353,7 @@ void testGetVariantClassificationForCodingRegions(final int chromosomeNumber,
transcript,
exonPositionList,
muc16TranscriptIdMap,
muc16TranscriptDataSource,
true);
false);

final GencodeFuncotation.VariantClassification varClass = GencodeFuncotationFactory.createVariantClassification(
variantContext,
Expand Down Expand Up @@ -1442,8 +1441,7 @@ void testSpliceSiteWindowSettings(final int chromosomeNumber,
transcript,
exonPositionList,
muc16TranscriptIdMap,
muc16TranscriptDataSource,
true);
false);

final GencodeFuncotation.VariantClassification varClass = GencodeFuncotationFactory.createVariantClassification(
variantContext,
Expand Down Expand Up @@ -2739,10 +2737,23 @@ public void testFivePrimeFlankSorting() {
new FlankSettings(5000, 0),
"TEST")) {

// We have to set the contig conversion because of the datasources / B37 mismatch:
funcotationFactory.setDoExonContigConversionToB37ForTranscripts(true);

final List<Funcotation> gencodeFuncotationList = funcotationFactory.createFuncotations(vcHg19, referenceContext, featureContext);

System.out.println(gencodeFuncotationList.size());
}
}

// @Test
// public void testGetCodingSequenceFromTranscriptFasta() {
//
//
// getCodingSequenceFromReferenceContext();
//
// final String transcriptFastaFile = FuncotatorTestConstants.PIK3CA_ALL_TRANSCRIPTS_GENCODE_TRANSCRIPT_FASTA_FILE;
// final String transcriptId = "ENST00000263967.1";
// final String expectedCodingSequence = "ATGAGTGGGAGCTGCT";
// }
}
Loading
Loading