From 14b4ca05f3b01adcea3c24919371a34da22b6d0e Mon Sep 17 00:00:00 2001 From: Daniel Cameron Date: Mon, 13 Jul 2020 22:57:42 +1000 Subject: [PATCH] Partially complete draft of VCF strict specifications --- Makefile | 22 ++--- SAMstrict.tex | 51 ++++++----- VCFstrict.tex | 241 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 278 insertions(+), 36 deletions(-) create mode 100644 VCFstrict.tex diff --git a/Makefile b/Makefile index 2a2617ccd..86e479b64 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ PDFS = BCFv1_qref.pdf \ SAMv1.pdf \ SAMtags.pdf \ SAMstrict.pdf \ + VCFstrict.pdf \ tabix.pdf \ VCFv4.1.pdf \ VCFv4.2.pdf \ @@ -20,16 +21,17 @@ pdf: $(PDFS:%=new/%) %.pdf: new/%.pdf cp $^ $@ -new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver -new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver -new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver -new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver -new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver -new/SAMstrict.pdf diff/SAMtags.pdf: SAMstrict.tex new/SAMstrict.ver -new/VCFv4.1.pdf diff/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver -new/VCFv4.2.pdf diff/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver -new/VCFv4.3.pdf diff/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver -new/VCFv4.4.pdf diff/VCFv4.4.pdf: VCFv4.4.tex new/VCFv4.4.ver +new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver +new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver +new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver +new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver +new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver +new/SAMstrict.pdf diff/SAMstrict.pdf: SAMstrict.tex new/SAMstrict.ver +new/VCFstrict.pdf diff/VCFstrict.pdf: VCFstrict.tex new/VCFstrict.ver +new/VCFv4.1.pdf diff/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver +new/VCFv4.2.pdf diff/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver +new/VCFv4.3.pdf diff/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver 
+new/VCFv4.4.pdf diff/VCFv4.4.pdf: VCFv4.4.tex new/VCFv4.4.ver PDFLATEX = pdflatex diff --git a/SAMstrict.tex b/SAMstrict.tex index b3a18c82a..67c4ca40b 100644 --- a/SAMstrict.tex +++ b/SAMstrict.tex @@ -7,14 +7,13 @@ \newcommand{\rulename}[1]{\tt #1} \newcommand{\rulecategory}[1]{\tt #1} \newcommand{\samrule}{\tt SAM} -\newcommand{\v15}{\tt v1.5} -\newcommand{\v15bestpractice}{\tt SAMv1.5 best practice} -\newcommand{\vcf43}{\tt VCFv4.3} +\newcommand{\vonefive}{\tt v1.5} +\newcommand{\vonefivebestpractice}{\tt SAMv1.5 best practice} +\newcommand{\vcffourthree}{\tt VCFv4.3} % #1: error message % #2: rule description % #3: categories \newcommand{\samstrictrule}[3]{ -# \paragraph{} #3 % error message formatting {\tt #1} @@ -25,7 +24,7 @@ % #2: categories \newcommand{\headerrequired}[2]{ \samstrictrule{Missing #1 header}{A #1 header must be present.}{#2} -}\ +} \newcommand{\headerunique}[2]{ \samstrictrule{Only one #1 header may be present}{Multiple #1 headers must not be present.}{#2} } @@ -125,10 +124,10 @@ \section{Headers} -\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/} or {\tt /\char94@CO\char92t.*/} regex.}{\samrule} \subsection{HD} -\headerrequired{HD}{\v15bestpractice} +\headerrequired{HD}{\vonefivebestpractice} \headerunique{HD} \headertagrequired{HD}{VN}{\samrule} -\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\v15bestpractice} +\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\vonefivebestpractice} \headertagregex{HD}{VN}{/\char94[0-9]+\char92.[0-9]+\$/}{\samrule} \samstrictrule{Unknown SAM version}{The HD header VN tag version number must match a published version of the SAM specifications.} \headertagvalues{HD}{SO}{{\tt unknown}, {\tt unsorted}, {\tt queryname} and {\tt coordinate}}{\samrule} @@ -136,7 +135,7 @@ \subsection{HD} \samstrictrule{Inconsistent HD header SO and GO tags}{The record orderings defined in the HD header SO and GO tags must be 
consistent} \subsection{SQ} -\samstrictrule{Missing SQ header}{The SQ header must be present if any reads have been mapped.}{\v15bestpractice} +\samstrictrule{Missing SQ header}{The SQ header must be present if any reads have been mapped.}{\vonefivebestpractice} \headertagrequired{SQ}{SN}{\samrule} \headertagregex{SQ}{SN}{[!-)+-\char60\char62-\char126][!-\char126]*}{\samrule} \headertagunique{SQ}{SN} @@ -172,14 +171,14 @@ \subsection{File Format} \samstrictrule{File is not UTF-8}{The file must use UTF-8 encoding.}{\samrule} \samstrictrule{Inconsistent line terminators}{All lines must be separated with the same new line character\(s\).} \samstrictrule{Malformed floating point value}{All floating point values must conform to the regex {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?}} -\samstrictrule{Malformed integer value}{All integer values must conform to the regex [-+]?[0-9]+}} +\samstrictrule{Malformed integer value}{All integer values must conform to the regex {\tt [-+]?[0-9]+}} \subsection{Ordering} \samstrictrule{Record ordering does not match HD header SO tag}{The order of records must be consistent with the HD header SO tag} \samstrictrule{Record ordering does not match HD header GO tag}{The order of records must be consistent with the HD header GO tag} \samstrictrule{Orphaned unmapped read}{If a read is unmapped, RNAME and POS must either be * and 0, or the RNAME and POS of another read from the same template.} - + \section{Records} \subsection{QNAME} @@ -209,42 +208,42 @@ \subsection{FLAG} \samstrictrule{Multiple primary alignment records}{Each segment must have at most one record with FLAG 0x100 and 0x800 not set.}{\samrule} \samstrictrule{Missing primary alignment record}{Each segment must have at least one record with FLAG 0x100 and 0x800 not set.}{\samrule} \samstrictrule{Unknown FLAG bit set}{FLAG bits higher than 0x800 must not be set}{\samrule} -\samstrictrule{Unmapped reads should not have FLAG 0x10 set}{Unmapped reads should be stored in the 
orientation in which they came off the sequencing machine and have 0x10 unset.}{\v15bestpractice} +\samstrictrule{Unmapped reads should not have FLAG 0x10 set}{Unmapped reads should be stored in the orientation in which they came off the sequencing machine and have 0x10 unset.}{\vonefivebestpractice} \subsection{RNAME} \samstrictrule{Malformed RNAME}{RNAME must conform to the regex {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*}}{\samrule} \samstrictrule{RNAME not present in reference}{RNAME must be equal to the value of one of the SQ SN values defined in the header.} -\samstrictrule{RNAME contains character not supported by VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcf43} -\samstrictrule{RNAME not supported by VCFv4.3}{RNAME must be not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcf43} -\samstrictrule{RNAME does not match mate}{For a unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have RNAME identical to its mate}{\v15bestpractice} -\samstrictrule{RNAME specified for unmapped template}{If all segments in a template are unmapped, their RNAME should be set as *}{\v15bestpractice} +\samstrictrule{RNAME contains character not supported by VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcffourthree} +\samstrictrule{RNAME not supported by VCFv4.3}{RNAME must be not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcffourthree} +\samstrictrule{RNAME does not match mate}{For a unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have RNAME identical to its mate}{\vonefivebestpractice} +\samstrictrule{RNAME specified for unmapped template}{If all segments in a template are unmapped, their RNAME should be set as *}{\vonefivebestpractice} \subsection{POS} -\samstrictrule{Record placed 
outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.}{\v15bestpractice} +\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.}{\vonefivebestpractice} \samstrictrule{Invalid POS}{POS cannot be 0 if FLAG 0x4 is set.} \samstrictrule{Invalid POS}{POS cannot be negative.}{\samrule} \samstrictrule{Invalid POS}{POS cannot be greater than 2147483647.}{\samrule} \samstrictrule{POS specified without RNAME}{If RNAME is *, POS must be 0.} -\samstrictrule{POS does not match mate.}{For a unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have POS identical to its mate}{\v15bestpractice} -\samstrictrule{POS specified for unmapped template}{If all segments in a template are unmapped, their POS should be set as 0.}{\v15bestpractice} +\samstrictrule{POS does not match mate.}{For a unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have POS identical to its mate}{\vonefivebestpractice} +\samstrictrule{POS specified for unmapped template}{If all segments in a template are unmapped, their POS should be set as 0.}{\vonefivebestpractice} \subsection{MAPQ} \samstrictrule{Invalid MAPQ}{MAPQ must be between 0 and 255 inclusive.}{\samrule} -\samstrictrule{Missing MAPQ}{Aligned reads should not have 255 MAPQ.}{\v15bestpractice} +\samstrictrule{Missing MAPQ}{Aligned reads should not have 255 MAPQ.}{\vonefivebestpractice} \subsection{CIGAR} \samstrictrule{Invalid CIGAR}{All CIGAR strings must conform to the regex {\tt \char92*|([0-9]+[MIDNSHPX=])+}}{\samrule} 
\samstrictrule{Empty CIGAR}{All CIGAR strings must have at least one CIGAR operator} \samstrictrule{Zero length CIGAR operator}{All CIGAR operators must have a non-zero positive length} -\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.}{\v15bestpractice} +\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.}{\vonefivebestpractice} \samstrictrule{CIGAR does not contain any mapped bases}{All CIGARs must include a CIGAR operator that consumes a reference base.}{\samrule} {\tt Should we allow alignments with zero mapped bases? Seven bridges has a graph-based aligner that will emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference. Useful for local assembly but technical voilates the SAM specifications} \samstrictrule{Incorrect CIGAR length}{Sum of lengths of the M/I/S/=/X operations must equal the length of SEQ when both CIGAR and SEQ are available.}{\samrule} \samstrictrule{Invalid CIGAR hard clip}{H must only be present as the first and/or last operation.}{\samrule} \samstrictrule{Invalid CIGAR soft clip}{S must only have H operations between them and the ends of the CIGAR string.}{\samrule} -\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.}{\v15bestpractice} +\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.}{\vonefivebestpractice} \samstrictrule{Inconsistent CIGAR read lengths}{All mapped alignments for a given segment must have matching read lengths. That is, the sum of lengths of the M/I/S/=/X/H operations must be equal.} \samstrictrule{TODO: Unusual indel positioning}{TODO: should we disallow I/D operators at the ends of reads? 
There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion.} @@ -269,7 +268,7 @@ \subsection{TLEN} \subsection{SEQ} \samstrictrule{Inconsistent SEQ read lengths}{All alignments of a given segment must have consistent SEQ lengths. That is, for all non-* SEQ, SEQ + length of CIGAR hard clip must be equal. } \samstrictrule{Inconsistent SEQ sequences}{All alignments of a given segment with non-* SEQ must have consistent base calls. A base cannot be called an A in one record, but a T in another. Note that to determine the read base, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.} -\samstrictrule{SEQ of secondary alignments specified.}{SEQ of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\v15bestpractice} +\samstrictrule{SEQ of secondary alignments specified.}{SEQ of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\vonefivebestpractice} \samstrictrule{Invalid sequence base}{Unless SEQ is "*", SEQ read bases must be one of the following characters: acmgrsvtwyhkdbnACMGRSVTWYHKDBN} \samstrictrule{SEQ does not match reference when CIGAR indicates match.}{Unless SEQ is "*", read bases with CIGAR operator = must match the reference base. Bases are considered to match if overlap between the possible read and reference bases (based on their IUPAC codes) is non-zero.} \samstrictrule{SEQ matches reference when CIGAR indicates mismatch.}{Unless SEQ is "*", read bases with CIGAR operator X must not match the reference base. 
Bases are considered to match on if, when ignoring case, the reference and read bases are the same character and the character is one of the following characters: acgtACGT.} @@ -278,7 +277,7 @@ \subsection{QUAL} \samstrictrule{QUAL specified without SEQ}{QUAL must be * if SEQ is *}{\samrule} \samstrictrule{SEQ QUAL length mismatch.}{The length of a non-* QUAL must match the length of SEQ.}{\samrule} \samstrictrule{Invalid QUAL}{The ASCII value of all QUAL bases must be at least 33.} -\samstrictrule{QUAL of secondary alignments specified.}{QUAL of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\v15bestpractice} +\samstrictrule{QUAL of secondary alignments specified.}{QUAL of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\vonefivebestpractice} \samstrictrule{TODO: QUAL edge case}{What should we do when a read is length 1 and the QUAL encodes to "*" ?}{\samrule} \samstrictrule{Inconsistent QUAL scores}{All alignments with non-* QUAL of a given segment must have consistent base quality scores. 
Note that to determine the base quality, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.} @@ -294,7 +293,7 @@ \subsection{Tag format} \samstrictrule{Malformed B tag}{B tags must conform to the the regex {\tt [cCsSiIf](,[-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?)+}}{\samrule} \samstrictrule{Non-integer value in integer array}{Type B tags starting with one of "cCsSiI" must contain integer values.} \samstrictrule{Tag array value out of bounds}{Type B tags must not contain values that are greater than or less than the maximum or minimum value representable by the specified prefix.} -\samstrictrule{Tag integer out of bounds for BAM representation}{Type i tag values must be within the range ~$[-2^{31},2^{32})$}{\bam} +\samstrictrule{Tag integer out of bounds for BAM representation}{Type i tag values must be within the range {\tt ~$[-2^{31},2^{32})$}}{\bam} \samstrictrule{Unknown reserved tag}{ No record can include any reserved tags not defined in the {\sl Sequence Alignment/Map Optional Fields Specification}. 
@@ -320,10 +319,10 @@ \subsection{Tag format} \subsection{RG} 3 When a RG tag appears anywhere in the alignment section, there should be a single corresponding -@RG line with matching ID tag in the header.{\v15bestpractice} +@RG line with matching ID tag in the header.{\vonefivebestpractice} \subsection{RG} 4 When a PG tag appears anywhere in the alignment section, there should be a single corresponding -@PG line with matching ID tag in the header.{\v15bestpractice} +@PG line with matching ID tag in the header.{\vonefivebestpractice} \paragraph{} diff --git a/VCFstrict.tex b/VCFstrict.tex new file mode 100644 index 000000000..16dbe4f5b --- /dev/null +++ b/VCFstrict.tex @@ -0,0 +1,241 @@ +\documentclass[10pt]{article} +\usepackage[margin=1in]{geometry} +\usepackage{longtable} +\usepackage[pdfborder={0 0 0},hyperfootnotes=false]{hyperref} +\usepackage[title]{appendix} + +% #1: short error code +% #2: short error description +% #3: rule description (accepted but not currently typeset by the macro body) +% #4: categories +% #5: additional explanatory text +\newcommand{\vcfstrictrule}[5]{ + \paragraph{#1} #2 #4 + #5 + \par +} +% Rule is part of the base VCF specifications +\newcommand{\vcfspec}{\texttt{VCF}} +\newcommand{\SPECISSUE}[1]{\paragraph{} #1} +\newcommand{\TODO}[1]{\paragraph{TODO} complete this section: #1} +% #1: Meta-information key +% #2: Missing field +\newcommand{\structuredheadermissingfield}[2] { + \vcfstrictrule{mi.#1.#2.missing}{Missing meta-information \texttt{#1} \texttt{#2} key}{Missing \texttt{#2} field for \texttt{#1} meta-information line.}{\vcfspec}{} +} +\newcommand{\externalfilevalidation}[1] { % #1: short error code (currently unused; the macro only emits a TODO) + \TODO{Should checking the external URL be included? 
My preference is that it is not and the validation should be entirely self-contained in the VCF} +} +\newcommand{\phredoob}[2] { + \vcfstrictrule{#1}{#2}{Phred-scaled fields must be greater than or equal to zero.}{}{} +} + +\begin{document} + +\input{VCFstrict.ver} +\title{VCF Strict Specification} +\author{Daniel L Cameron} +\date{\headdate} +\maketitle +\begin{quote}\small +The master version of this document can be found at +\url{https://github.com/samtools/hts-specs}.\\ +This printing is version~\commitdesc\ from that repository, +last modified on the date shown above. +\end{quote} +\vspace*{1em} + +\noindent +This document is a companion to the {\sl Variant Call Format Specification} that defines the VCF file format. +\footnote{See \href{http://samtools.github.io/hts-specs/VCFv4.3.pdf}{\tt VCFv4.3.pdf} at \url{https://github.com/samtools/hts-specs}.} +The VCF file format defines the syntax required for a file to be a valid VCF file. +It does not require such files to be semantically valid and internally consistent. +This document describes a set of additional semantic restrictions for which the subset of syntactically valid VCF files that comply with these restrictions can be described as \textit{VCF strict compliant}. + +\renewcommand{\abstractname}{Introduction} +\begin{abstract} + +The VCF specifications have been instrumental in standardising the file formats used for variant calling. +A large ecosystem of bioinformatics tools is now capable of reading and/or writing VCF files. +Unfortunately, many tools that read VCF files are tightly coupled to a particular upstream tool and fail to correctly execute on valid VCF files written by other tools. +In part, this is due to the lack of semantic restrictions inherent in the VCF file format. +A syntactically valid VCF file can be both internally inconsistent and semantically nonsensical. 
+ +The purpose of this document is to provide a baseline of semantic validity with which tools should comply when outputting VCF files, and which tools that input VCF files can safely assume when they require input files to be \textit{VCF strict compliant}. + +\end{abstract} + +\section{Format} + +\vcfstrictrule{file.encoding}{Invalid file encoding}{File is not a valid UTF-8 file.}{\vcfspec}{} +\vcfstrictrule{file.newlines}{Inconsistent newlines}{File mixes LF and CR+LF line terminators.}{}{} +\vcfstrictrule{file.blankline}{Blank line}{File contains a blank line.}{}{} + +\section{Meta-information Lines} + + +\vcfstrictrule{mi.keyvalue.malformed}{Malformed Meta-information line}{Meta-information line is not of the form key=value}{\vcfspec}{} +\vcfstrictrule{mi.key.malformed}{Invalid Meta-information key}{Meta-information line key must conform to the regex \texttt{[[:alpha:]]+}}{}{} + +\subsection{Structured fields} + +\vcfstrictrule{mi.structured.value.malformed}{Malformed structured meta-information}{Structured meta-information line value does not start with \texttt{<} and end with \texttt{>}.}{\vcfspec}{} +\vcfstrictrule{mi.structured.extrafield.position}{Incorrectly placed structured meta-information extra field }{Structured meta-information extra field located before a default field.}{\vcfspec}{} +\vcfstrictrule{mi.structured.extrafield.malformed}{Incorrectly typed structured meta-information extra field}{Structured meta-information extra field does not start and end with \texttt{"}.}{\vcfspec}{} +\vcfstrictrule{mi.structured.duplicated}{Duplicate structured meta-information line}{Multiple meta-information lines with the same key and \texttt{ID} found.}{\vcfspec}{} + +\SPECISSUE{What's the point of quotes in structured header fields? 
Just so they can contain commas?} + +\subsection{fileformat} +\vcfstrictrule{mi.fileformat.missing}{Missing fileformat}{fileformat meta-information line is missing}{\vcfspec}{} +\vcfstrictrule{mi.fileformat.position}{fileformat not first}{fileformat meta-information line is not the first line}{\vcfspec}{} +\vcfstrictrule{mi.fileformat.invalid}{Malformed fileformat}{fileformat value is not one of \texttt{VCFv4.1}, \texttt{VCFv4.2}, \texttt{VCFv4.3}, \texttt{VCFv4.4} }{\vcfspec}{} + +\subsection{INFO} + +\structuredheadermissingfield{INFO}{ID} +\structuredheadermissingfield{INFO}{Number} +\structuredheadermissingfield{INFO}{Type} +\structuredheadermissingfield{INFO}{Description} + +\vcfstrictrule{mi.INFO.ID.malformed}{Malformed meta-information \texttt{INFO} \texttt{ID} field}{INFO ID field does not match the regex \texttt{\char94([A-Za-z\_][0-9A-Za-z\_.]*|1000G)\$}}{\vcfspec}{} +\vcfstrictrule{mi.INFO.Number.malformed}{Malformed meta-information \texttt{INFO} \texttt{Number} field}{INFO Number field is not a non-negative integer, \texttt{A}, \texttt{R}, \texttt{G}, or \texttt{.}.}{\vcfspec}{} +\TODO{Check fields are valid for each VCF version} +\vcfstrictrule{mi.INFO.Type.malformed}{Malformed meta-information \texttt{INFO} \texttt{Type} field}{INFO Type field is not one of \texttt{Integer}, \texttt{Float}, \texttt{Flag}, \texttt{Character}, \texttt{String}.}{\vcfspec}{} + +\subsection{FILTER} + +\structuredheadermissingfield{FILTER}{ID} +\structuredheadermissingfield{FILTER}{Description} + +\subsection{FORMAT} + +\structuredheadermissingfield{FORMAT}{ID} +\structuredheadermissingfield{FORMAT}{Number} +\structuredheadermissingfield{FORMAT}{Type} +\structuredheadermissingfield{FORMAT}{Description} + +\vcfstrictrule{mi.FORMAT.ID.malformed}{Malformed meta-information \texttt{FORMAT} \texttt{ID} field}{FORMAT ID field does not match the regex \texttt{\char94[A-Za-z\_][0-9A-Za-z\_.]*\$}}{\vcfspec}{} +\vcfstrictrule{mi.FORMAT.Number.malformed}{Malformed meta-information \texttt{FORMAT} \texttt{Number} field}{FORMAT Number field is not a positive integer, \texttt{A}, \texttt{R}, \texttt{G}, or 
\texttt{.}.}{\vcfspec}{} +\TODO{Check fields are valid for each VCF version} +\vcfstrictrule{mi.FORMAT.Type.malformed}{Malformed meta-information \texttt{FORMAT} \texttt{Type} field}{FORMAT Type field is not one of \texttt{Integer}, \texttt{Float}, \texttt{Character}, \texttt{String}.}{\vcfspec}{} + + +\subsection{ALT} + +\structuredheadermissingfield{ALT}{ID} +\structuredheadermissingfield{ALT}{Description} + +\SPECISSUE{CNV and BND are valid 3-base IUPAC codes. Very bad. DUP also problematic for RNA} +\SPECISSUE{Why are IUPAC codes here? Seems like a bad idea to have to define every possible IUPAC indel used} +\SPECISSUE{BND is not actually a valid ALT allele.} +\SPECISSUE{DUP/DEL is defined as SVCLAIM=CN} + +\subsection{assembly} +\TODO{Should checking the URL be included? My preference is that it is not and the validation should be entirely self-contained in the VCF} +\externalfilevalidation{assembly.missingfile} + +\SPECISSUE{What happens if there are multiple assembly files specified?} +\SPECISSUE{Why must the assembly file be a fasta file? GRIDSS uses a BAM file for breakpoint assembly contigs.} +\SPECISSUE{This is defined as a breakpoint assembly file, but 1.6.1.1 refers to it directly. Is this an inconsistent double-use of this header field?} + +\vcfstrictrule{mi.assembly.contig.reserved}{Assembly contig name is reserved.}{ The assembly file contains a reserved contig name.}{\vcfspec}{ +Reserved contig names are contigs named, or containing a colon and starting with any of \texttt{DEL}, \texttt{DUP}, \texttt{INV}, \texttt{INS}, \texttt{CNV}, \texttt{*}. 
+} + +\subsection{contig} + +\structuredheadermissingfield{contig}{ID} +\structuredheadermissingfield{contig}{length} % Should not be in \vcfspec category since it's not required by the specs +\vcfstrictrule{mi.contig.ID.malformed}{Malformed meta-information \texttt{contig} \texttt{ID} field}{contig ID field does not match the regex \texttt{[0-9A-Za-z!\#\$\%\&+./:;?@\char94\_|\char126-][0-9A-Za-z!\#\$\%\&*+./:;=?@\char94\_|\char126-]*}.}{\vcfspec}{} +\vcfstrictrule{mi.contig.length.malformed}{Malformed meta-information \texttt{contig} \texttt{length} field}{contig length field is not an integer.}{\vcfspec}{} +\vcfstrictrule{mi.contig.length.outofbounds}{Meta-information \texttt{contig} \texttt{length} field out of bounds.}{Out of bounds contig length field. Minimum value is 0. Maximum value is 2,147,483,647 ($2^{31}-1$). }{\texttt{BCF}}{ +BCF encodes position using a signed 32-bit integer. +} +\externalfilevalidation{mi.contig.url} + +\subsection{SAMPLE / META / PEDIGREE } + +\SPECISSUE{These aren't specified nearly well enough.} +\SPECISSUE{META and SAMPLE are not defined as structured fields in s1.4.0} + +\subsection{pedigreeDB } + +\externalfilevalidation{mi.pedigreeDB} + +\section{Header} + +\vcfstrictrule{header.sampleID.duplicate}{Duplicate sample ID}{Duplicated sample ID found.}{\vcfspec}{} +\vcfstrictrule{header.sampleID.empty}{Empty sample ID}{Header sample ID must be at least 1 character in length.}{}{} + +\section{Data lines} + +\vcfstrictrule{line.length.mismatch}{Mismatching sample count}{The number of samples for which genotype information has been provided does not match the number of samples defined in the header.}{}{} +\vcfstrictrule{line.CHROM.grouped}{CHROM ungrouped}{Records are not grouped by CHROM.}{\vcfspec}{} +\vcfstrictrule{line.POS.outoforder}{POS unsorted}{Records grouped by CHROM are not in ascending order by POS.}{}{} +\vcfstrictrule{line.CHROM.outoforder}{CHROM unsorted}{CHROM ordering does not match the order of the meta-information contig records.}{}{} + +\subsection{Fixed fields} + 
+\subsubsection{CHROM} + +\vcfstrictrule{CHROM.missing}{Missing contig}{ No \#\#contig meta-information line found for this record. Does not apply to angle-bracketed ID Strings. }{}{} +\vcfstrictrule{CHROM.assembly.assembly.missing}{Missing assembly file}{ No assembly file specified using \#\#assembly. Applies only to angle-bracketed ID Strings. }{\vcfspec}{} +\vcfstrictrule{CHROM.assembly.missing}{Missing assembly contig}{ Assembly file contig identifier not found in the assembly file. Applies only to angle-bracketed ID Strings. }{\vcfspec}{} + +\subsubsection{POS} + +\vcfstrictrule{POS.outofbounds}{POS out of bounds }{ Value out of representable bounds. Minimum value is 0. Maximum value is 2,147,483,647 ($2^{31}-1$). }{\vcfspec}{} +\vcfstrictrule{POS.contig.outofbounds}{POS exceeds contig length. }{ POS must be less than, or equal to, the contig length + 1 }{}{} +\vcfstrictrule{POS.telomere.nonbnd}{Telomeric records must be BND }{ Telomeric records with POS of 0 or contig length + 1 must be BND symbolic alleles.}{}{} + +\subsubsection{ID} + +\vcfstrictrule{ID.duplicate}{ Duplicate ID }{ One or more of the semi-colon separated IDs in this field is not unique. }{\vcfspec}{} +\SPECISSUE{VCF merging problematic if different VCFs have different INFO.} + +\subsubsection{REF} + +\vcfstrictrule{REF.malformed}{ Invalid REF }{ REF does not match the regex \texttt{[ACGTNacgtn]+}. }{\vcfspec}{} +\vcfstrictrule{REF.reference.mismatch}{REF does not match reference }{ REF does not match reference genome sequence. }{}{} + +\subsubsection{ALT} + +\vcfstrictrule{ALT.malformed}{ Invalid ALT }{ ALT does not match the regex \texttt{[ACGTNacgtn]+}, \texttt{.}, \texttt{*}, a breakpoint string, a single breakend string, or a symbolic allele. }{\vcfspec}{} +\SPECISSUE{Is "ACT,." a valid ALT? The definition of QUAL could be read to mean that . must be the only ALT if it is supplied.} +\SPECISSUE{Single breakends are not explicitly required to have at least one base - "." 
could be interpreted as a single breakend.} +\vcfstrictrule{ALT.duplicate}{ Duplicated ALT }{ ALT alleles are not unique within this record. }{}{} +\vcfstrictrule{ALT.breakpoint.POS.outofbounds }{ The breakpoint is out of bounds. }{ The position of the other side of the breakpoint is greater than the relevant contig length + 1}{}{} +\vcfstrictrule{ALT.breakpoint.CHROM.missing }{ The breakpoint contig is not valid. }{ No \#\#contig meta-information line found for this record, and, if angle bracketed, is not found in the assembly file. }{}{} +\vcfstrictrule{ALT.breakpoint.telomere.orientation }{The breakpoint orientation is invalid.}{ If the breakpoint position is 0 or contig length + 1, the breakpoint orientation must be towards the telomere. }{}{} + +\subsubsection{QUAL} + +\phredoob{QUAL.outofbounds}{QUAL out of bounds} + +\subsubsection{FILTER} + +\vcfstrictrule{FILTER.malformed}{Malformed FILTER}{ Filter cannot be \texttt{0}.}{\vcfspec}{} +\vcfstrictrule{FILTER.empty}{Empty FILTER}{ Filter cannot be the empty string.}{}{} +\vcfstrictrule{FILTER.missing}{Invalid FILTER}{ Missing FILTER meta-information line. }{}{} +\vcfstrictrule{FILTER.missingvalue.notsolo}{FILTER MISSING value must be the only value.}{ No other values can be present if the MISSING value is present.}{}{} +\vcfstrictrule{FILTER.duplicate}{Duplicated FILTER}{ FILTER is not unique within this record. 
}{}{} + +\section{Semantics} + +\subsection { Breakpoints } +\vcfstrictrule{breakpoint.MATEID.missing}{Breakpoint MATEID required}{Breakpoint record must have MATEID specified.}{}{} +\vcfstrictrule{breakpoint.MATEID.malformed}{Malformed MATEID}{MATEID cannot be the MISSING value.}{}{} +\vcfstrictrule{breakpoint.ID.missing}{Breakpoint ID missing}{Breakpoint ID cannot be the MISSING value \texttt{.}.}{}{} +\vcfstrictrule{breakpoint.mate.missing}{Missing breakpoint mate}{A breakpoint record with ID matching MATEID must exist.}{}{} +\vcfstrictrule{breakpoint.POS.mismatch}{Breakpoint POS mismatch}{The POS of the matching breakpoint record does not match the position in the ALT field}{}{} +\vcfstrictrule{breakpoint.CHROM.mismatch}{Breakpoint CHROM mismatch}{The CHROM of the matching breakpoint record does not match the contig in the ALT field}{}{} +\vcfstrictrule{breakpoint.HOMLEN.mismatch}{Breakpoint HOMLEN mismatch}{The HOMLEN of the matching breakpoint record does not match the HOMLEN of this record.}{}{} +\vcfstrictrule{breakpoint.HOMSEQ.mismatch}{Breakpoint HOMSEQ mismatch}{After adjusting for breakend orientations, the HOMSEQ of the matching breakpoint record does not match the HOMSEQ of this record.}{}{} +\vcfstrictrule{breakpoint.CIPOS.mismatch}{Breakpoint CIPOS mismatch}{After adjusting for breakend orientations, the CIPOS of the matching breakpoint record does not match the CIPOS of this record.}{}{ +Note: this rule only applies to events that are not IMPRECISE. +A breakpoint can validly have different confidence intervals on either side of a breakpoint. +For example, a breakpoint into a poly-A stretch of indeterminate length can have the position known exactly on one side, but a wide CIPOS on the side with a poly-A reference sequence. +} + +\end{document}