Skip to content

Commit ddfc1a6

Browse files
committed
fix bugs
1 parent 98c94da commit ddfc1a6

File tree

10 files changed

+99
-22
lines changed

10 files changed

+99
-22
lines changed

src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants.java

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@
2828
* Left-align indels in a variant callset
2929
*
3030
* <p>
31-
* This tool takes a VCF file, left-aligns the indels and trims common bases from indels,
31+
* This tool takes a VCF file, left-aligns the indels and trims common bases from all variants,
3232
* leaving them with a minimum representation. The same indel can often be placed at multiple positions and still
3333
* represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position
3434
* this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic
35-
* sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.
35+
* sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from variants.
3636
* </p>
3737
*
3838
* <h3>Input</h3>
@@ -172,8 +172,10 @@ public class LeftAlignAndTrimVariants extends VariantWalker {
172172
private VariantContextWriter vcfWriter = null;
173173
private VCFHeader vcfHeader = null;
174174

175-
VariantContext lastVariant;
176-
175+
private int thisVariantGroupStart = 0;
176+
private String thisVariantGroupContig = null;
177+
private VariantContext lastVariantWritten = null;
178+
private final List<VariantContext> realignedVariants = new ArrayList<>();
177179
@Override
178180
public void onTraversalStart() {
179181
final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
@@ -213,6 +215,17 @@ private Set<VCFHeaderLine> createVCFHeaderLineList(Map<String, VCFHeader> vcfHea
213215
*/
214216
@Override
215217
public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) {
218+
if (vc.getContig() != thisVariantGroupContig || vc.getStart() > thisVariantGroupStart) {
219+
realignedVariants.sort(Comparator.comparingInt(VariantContext::getStart));
220+
for (VariantContext realignedVariant : realignedVariants) {
221+
vcfWriter.add(realignedVariant);
222+
}
223+
thisVariantGroupStart = vc.getStart();
224+
thisVariantGroupContig = vc.getContig();
225+
lastVariantWritten = realignedVariants.isEmpty() ? lastVariantWritten : realignedVariants.get(realignedVariants.size() - 1);
226+
realignedVariants.clear();
227+
}
228+
216229
final List<VariantContext> vcList;
217230
if (splitMultiallelics) {
218231
if (vc.getGenotypes().stream().anyMatch(g -> g.hasAnyAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))) {
@@ -231,12 +244,10 @@ public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext
231244
if (indelLength > maxIndelSize) {
232245
logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --max-indel-length >= %d",
233246
"Indel is too long", indelLength, splitVariant.getContig(), splitVariant.getStart(), indelLength));
234-
lastVariant = splitVariant;
235-
vcfWriter.add(splitVariant);
247+
realignedVariants.add(splitVariant);
236248
} else {
237-
final int distanceToLastVariant = (lastVariant != null && splitVariant.contigsMatch(lastVariant)) ? splitVariant.getStart() - lastVariant.getEnd() : Integer.MAX_VALUE;
238-
lastVariant = GATKVariantContextUtils.leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant - 1), !dontTrimAlleles);
239-
vcfWriter.add(lastVariant);
249+
final int distanceToLastVariant = (lastVariantWritten != null && splitVariant.contigsMatch(lastVariantWritten)) ? splitVariant.getStart() - lastVariantWritten.getEnd() : Integer.MAX_VALUE;
250+
realignedVariants.add(GATKVariantContextUtils.leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant), !dontTrimAlleles));
240251
}
241252
}
242253
}
@@ -256,6 +267,11 @@ public boolean requiresReference() {
256267
*/
257268
@Override
258269
public Object onTraversalSuccess() {
270+
//write out remaining variants
271+
realignedVariants.sort(Comparator.comparingInt(VariantContext::getStart));
272+
for (VariantContext realignedVariant : realignedVariants) {
273+
vcfWriter.add(realignedVariant);
274+
}
259275
return "SUCCESS";
260276
}
261277

src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVariantContextUtils.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2220,15 +2220,13 @@ public static <T> List<T> removeItemsByIndex(List<T> data, List<Integer> indexes
22202220
* By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be
22212221
* handled by calling routine.
22222222
*
2223-
* @param vc Input VC with variants to left align
2223+
* @param vc Input VC with variants to left align and trim
22242224
* @param ref Reference context
22252225
* @return new VC.
22262226
*/
2227-
public static VariantContext leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBases, final boolean trim) {
2228-
if (!vc.isIndel() || maxLeadingBases <= 0) {
2229-
return vc;
2230-
}
2227+
public static VariantContext leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBasesIndel, final boolean trim) {
22312228

2229+
final int maxLeadingBases = vc.isIndel() ? maxLeadingBasesIndel : 0;
22322230

22332231
for(int leadingBases = Math.min(maxLeadingBases, 10); leadingBases <= maxLeadingBases; leadingBases = Math.min(2*leadingBases, maxLeadingBases)) {
22342232
final int refStart = Math.max(vc.getStart() - leadingBases, 1);
@@ -2245,8 +2243,9 @@ public static VariantContext leftAlignAndTrim(final VariantContext vc, final Ref
22452243
return result;
22462244
}).collect(Collectors.toList());
22472245

2246+
final int boundStart = vc.isSNP() || vc.isMNP() ? variantOffsetInRef : variantOffsetInRef + 1; // +1 to ignore the shared base in front for indels
22482247
final List<IndexRange> alleleRanges = vc.getAlleles().stream()
2249-
.map(a -> new IndexRange(variantOffsetInRef + 1, variantOffsetInRef + a.length())) // +1 to ignore the shared base in front
2248+
.map(a -> new IndexRange(boundStart, variantOffsetInRef + a.length()))
22502249
.collect(Collectors.toList());
22512250

22522251
// note that this also shifts the index ranges as a side effect, so below they can be used to output allele bases

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38.vcf

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,12 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3535
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3636
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3737
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3939
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
40+
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
41+
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
42+
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
43+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
44+
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
45+
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
46+
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38_maxIndelSize296.vcf

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,12 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3535
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3636
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3737
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3939
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
40+
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
41+
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
42+
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
43+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
44+
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
45+
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
46+
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38_maxIndelSize342.vcf

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,12 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3535
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3636
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3737
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3939
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
40+
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
41+
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
42+
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
43+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
44+
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
45+
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
46+
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38_notrim.vcf

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,12 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3535
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3636
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3737
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
38+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3939
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
40+
chr21 39584859 . CAA GAA 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
41+
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
42+
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
43+
chr21 39586239 . CAATT CAATTA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
44+
chr21 39586240 . AATTA AATT 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
45+
chr21 39586243 . TAAAA TAGAA 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
46+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38_notrim_split_multiallelics.vcf

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3737
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3838
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3939
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
40-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
40+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
4141
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
42+
chr21 39584859 . CAA GAA 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
43+
chr21 39584948 . A AC 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
44+
chr21 39584950 . C G 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
45+
chr21 39584951 . CA C 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
46+
chr21 39584953 . AA AG 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
47+
chr21 39586239 . CAATT CAATTA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
48+
chr21 39586240 . AATTA AATT 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
49+
chr21 39586243 . TAAAA TAGAA 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
50+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38_split_multiallelics.vcf

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3737
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3838
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3939
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
40-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
40+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
4141
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
42+
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
43+
chr21 39584948 . A AC 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
44+
chr21 39584950 . C G 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
45+
chr21 39584951 . CA C 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
46+
chr21 39584954 . A G 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
47+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
48+
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
49+
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
50+
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/expected_left_align_hg38_split_multiallelics_keepOrigAC.vcf

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
4040
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
4141
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
4242
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
43-
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
43+
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
4444
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
45+
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
46+
chr21 39584948 . A AC 30 . AC=1;AC_Orig=1;AF=0.250;AF_Orig=0.250;AN=4;AN_Orig=4 GT 0/0 0/1
47+
chr21 39584950 . C G 30 . AC=2;AC_Orig=2;AF=0.500;AF_Orig=0.500;AN=4;AN_Orig=4 GT 1/1 0/0
48+
chr21 39584951 . CA C 30 . AC=1;AC_Orig=1;AF=0.250;AF_Orig=0.250;AN=4;AN_Orig=4 GT 0/0 0/1
49+
chr21 39584954 . A G 30 . AC=2;AC_Orig=2;AF=0.500;AF_Orig=0.500;AN=4;AN_Orig=4 GT 1/1 0/0
50+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
51+
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
52+
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
53+
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0

src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/LeftAlignAndTrimVariants/test_left_align_hg38.vcf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,10 @@ chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
3737
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3838
chr21 13255301 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
3939
chr21 39584006 . CTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCC C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
40+
chr21 39584859 . CAA GAA 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
41+
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
42+
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
43+
chr21 39586243 . TAAAA TAGAA 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
44+
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
45+
chr21 39586243 . TAAAA TAAA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
46+
chr21 39586243 . TAAAA TAAAAA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1

0 commit comments

Comments
 (0)