Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@
* Left-align indels in a variant callset
*
* <p>
* This tool takes a VCF file, left-aligns the indels and trims common bases from indels,
* This tool takes a VCF file, left-aligns the indels and trims common bases from all variants,
* leaving them with a minimum representation. The same indel can often be placed at multiple positions and still
* represent the same haplotype. While the standard convention with VCF is to place an indel at the left-most position
* this isn't always done, so this tool can be used to left-align them. This tool optionally splits multiallelic
* sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from indels.
* sites into biallelics and left-aligns individual alleles. Optionally, the tool will not trim common bases from variants.
* </p>
*
* <h3>Input</h3>
Expand Down Expand Up @@ -172,8 +172,10 @@ public class LeftAlignAndTrimVariants extends VariantWalker {
private VariantContextWriter vcfWriter = null;
private VCFHeader vcfHeader = null;

VariantContext lastVariant;

private int thisVariantGroupStart = 0;
private String thisVariantGroupContig = null;
private VariantContext lastVariantWritten = null;
private final List<VariantContext> realignedVariants = new ArrayList<>();
@Override
public void onTraversalStart() {
final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
Expand Down Expand Up @@ -213,6 +215,17 @@ private Set<VCFHeaderLine> createVCFHeaderLineList(Map<String, VCFHeader> vcfHea
*/
@Override
public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) {
// A new group begins when the contig changes or the start position advances. Before handling
// this record, write out everything buffered for the previous group in ascending start order.
// Contig names are compared with equals(): == only tests String identity, so two records on the
// same contig carrying distinct String instances would otherwise look like a contig change.
if (!vc.getContig().equals(thisVariantGroupContig) || vc.getStart() > thisVariantGroupStart) {
    realignedVariants.sort(Comparator.comparingInt(VariantContext::getStart));
    for (final VariantContext realignedVariant : realignedVariants) {
        vcfWriter.add(realignedVariant);
    }
    thisVariantGroupStart = vc.getStart();
    thisVariantGroupContig = vc.getContig();
    // Keep the previous value when nothing was flushed (e.g. on the very first record).
    if (!realignedVariants.isEmpty()) {
        lastVariantWritten = realignedVariants.get(realignedVariants.size() - 1);
    }
    realignedVariants.clear();
}

final List<VariantContext> vcList;
if (splitMultiallelics) {
if (vc.getGenotypes().stream().anyMatch(g -> g.hasAnyAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY))) {
Expand All @@ -231,12 +244,10 @@ public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext
if (indelLength > maxIndelSize) {
logger.info(String.format("%s (%d) at position %s:%d; skipping that record. Set --max-indel-length >= %d",
"Indel is too long", indelLength, splitVariant.getContig(), splitVariant.getStart(), indelLength));
lastVariant = splitVariant;
vcfWriter.add(splitVariant);
realignedVariants.add(splitVariant);
} else {
final int distanceToLastVariant = (lastVariant != null && splitVariant.contigsMatch(lastVariant)) ? splitVariant.getStart() - lastVariant.getEnd() : Integer.MAX_VALUE;
lastVariant = GATKVariantContextUtils.leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant - 1), !dontTrimAlleles);
vcfWriter.add(lastVariant);
final int distanceToLastVariant = (lastVariantWritten != null && splitVariant.contigsMatch(lastVariantWritten)) ? splitVariant.getStart() - lastVariantWritten.getStart() : Integer.MAX_VALUE;
realignedVariants.add(GATKVariantContextUtils.leftAlignAndTrim(splitVariant, ref, Math.min(maxLeadingBases, distanceToLastVariant), !dontTrimAlleles));
}
}
}
Expand All @@ -256,6 +267,11 @@ public boolean requiresReference() {
*/
@Override
public Object onTraversalSuccess() {
    // Variants buffered since the last flush in apply() have not been written yet;
    // emit them here in ascending start order.
    realignedVariants.sort(Comparator.comparingInt(VariantContext::getStart));
    realignedVariants.forEach(pending -> vcfWriter.add(pending));
    return "SUCCESS";
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2220,15 +2220,13 @@ public static <T> List<T> removeItemsByIndex(List<T> data, List<Integer> indexes
* By definition, it will only take biallelic vc's. Splitting into multiple alleles has to be
* handled by calling routine.
*
* @param vc Input VC with variants to left align
* @param vc Input VC with variants to left align and trim
* @param ref Reference context
* @return new VC.
*/
public static VariantContext leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBases, final boolean trim) {
if (!vc.isIndel() || maxLeadingBases <= 0) {
return vc;
}
public static VariantContext leftAlignAndTrim(final VariantContext vc, final ReferenceContext ref, final int maxLeadingBasesIndel, final boolean trim) {

final int maxLeadingBases = vc.isIndel() ? maxLeadingBasesIndel : 0;

for(int leadingBases = Math.min(maxLeadingBases, 10); leadingBases <= maxLeadingBases; leadingBases = Math.min(2*leadingBases, maxLeadingBases)) {
final int refStart = Math.max(vc.getStart() - leadingBases, 1);
Expand All @@ -2245,8 +2243,9 @@ public static VariantContext leftAlignAndTrim(final VariantContext vc, final Ref
return result;
}).collect(Collectors.toList());

final int boundStart = vc.isSNP() || vc.isMNP() ? variantOffsetInRef : variantOffsetInRef + 1; // +1 to ignore the shared base in front for indels
final List<IndexRange> alleleRanges = vc.getAlleles().stream()
.map(a -> new IndexRange(variantOffsetInRef + 1, variantOffsetInRef + a.length())) // +1 to ignore the shared base in front
.map(a -> new IndexRange(boundStart, variantOffsetInRef + a.length()))
.collect(Collectors.toList());

// note that this also shifts the index ranges as a side effect, so below they can be used to output allele bases
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,14 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . CAA GAA 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39586239 . CAATT CAATTA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586240 . AATTA AATT 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . TAAAA TAGAA 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,16 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . CAA GAA 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584948 . A AC 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39584950 . C G 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
chr21 39584951 . CA C 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39584953 . AA AG 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
chr21 39586239 . CAATT CAATTA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586240 . AATTA AATT 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . TAAAA TAGAA 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,16 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584948 . A AC 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39584950 . C G 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
chr21 39584951 . CA C 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39584954 . A G 30 . AC=2;AF=0.500;AN=4 GT 1/1 0/0
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,16 @@ chr21 10382389 . A ATT 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 10388233 . GGAA G 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255297 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255296 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39583817 . CTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCT C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . C G 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584948 . A AC 30 . AC=1;AC_Orig=1;AF=0.250;AF_Orig=0.250;AN=4;AN_Orig=4 GT 0/0 0/1
chr21 39584950 . C G 30 . AC=2;AC_Orig=2;AF=0.500;AF_Orig=0.500;AN=4;AN_Orig=4 GT 1/1 0/0
chr21 39584951 . CA C 30 . AC=1;AC_Orig=1;AF=0.250;AF_Orig=0.250;AN=4;AN_Orig=4 GT 0/0 0/1
chr21 39584954 . A G 30 . AC=2;AC_Orig=2;AF=0.500;AF_Orig=0.500;AN=4;AN_Orig=4 GT 1/1 0/0
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TA T 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . T TA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586245 . A G 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586744 . AC A 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,12 @@ chr21 10804284 . T TGC 30 . AC=1;AF=0.250;AN=2 GT 0/1 ./.
chr21 13255296 . A G 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 13255301 . AAA A 30 . AC=2;AF=0.500;AN=2 GT 1/1 ./.
chr21 39584006 . CTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCCTTCCC C 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584859 . CAA GAA 30 . AC=3;AF=0.750;AN=4 GT 1/1 0/1
chr21 39584950 . C G,CC 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39584953 . AA AG,A 30 . AC=2,1;AF=0.500,0.250;AN=4 GT 1/1 0/2
chr21 39586243 . TAAAA TAGAA 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TAAAA T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586243 . TAAAA TAAA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586243 . TAAAA TAAAAA 30 . AC=1;AF=0.250;AN=4 GT 0/0 0/1
chr21 39586743 . TACC T 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
chr21 39586745 . CC C 30 . AC=1;AF=0.250;AN=4 GT 0/1 0/0
Loading