Skip to content

Commit

Permalink
Partially complete draft of VCF strict specifications
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Cameron committed Jul 13, 2020
1 parent adbdf5b commit 14b4ca0
Show file tree
Hide file tree
Showing 3 changed files with 278 additions and 36 deletions.
22 changes: 12 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ PDFS = BCFv1_qref.pdf \
SAMv1.pdf \
SAMtags.pdf \
SAMstrict.pdf \
VCFstrict.pdf \
tabix.pdf \
VCFv4.1.pdf \
VCFv4.2.pdf \
Expand All @@ -20,16 +21,17 @@ pdf: $(PDFS:%=new/%)
%.pdf: new/%.pdf
cp $^ $@

new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver
new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver
new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver
new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver
new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver
new/SAMstrict.pdf diff/SAMtags.pdf: SAMstrict.tex new/SAMstrict.ver
new/VCFv4.1.pdf diff/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver
new/VCFv4.2.pdf diff/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver
new/VCFv4.3.pdf diff/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver
new/VCFv4.4.pdf diff/VCFv4.4.pdf: VCFv4.4.tex new/VCFv4.4.ver
new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver
new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver
new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver
new/SAMv1.pdf diff/SAMv1.pdf: SAMv1.tex new/SAMv1.ver
new/SAMtags.pdf diff/SAMtags.pdf: SAMtags.tex new/SAMtags.ver
new/SAMstrict.pdf diff/SAMstrict.pdf: SAMstrict.tex new/SAMstrict.ver
# VCFstrict: the rendered PDF and its diff PDF both depend on VCFstrict.tex and new/VCFstrict.ver.
new/VCFstrict.pdf diff/VCFstrict.pdf: VCFstrict.tex new/VCFstrict.ver
new/VCFv4.1.pdf diff/VCFv4.1.pdf: VCFv4.1.tex new/VCFv4.1.ver
new/VCFv4.2.pdf diff/VCFv4.2.pdf: VCFv4.2.tex new/VCFv4.2.ver
new/VCFv4.3.pdf diff/VCFv4.3.pdf: VCFv4.3.tex new/VCFv4.3.ver
new/VCFv4.4.pdf diff/VCFv4.4.pdf: VCFv4.4.tex new/VCFv4.4.ver

PDFLATEX = pdflatex

Expand Down
51 changes: 25 additions & 26 deletions SAMstrict.tex
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@
\newcommand{\rulename}[1]{\tt #1}
\newcommand{\rulecategory}[1]{\tt #1}
\newcommand{\samrule}{\tt SAM}
\newcommand{\v15}{\tt v1.5}
\newcommand{\v15bestpractice}{\tt SAMv1.5 best practice}
\newcommand{\vcf43}{\tt VCFv4.3}
% Version/category labels, typeset in typewriter font.
% The macro names spell out the version digits because control-sequence
% names may contain only letters.
\newcommand{\vonefive}{\tt v1.5}
\newcommand{\vonefivebestpractice}{\tt SAMv1.5 best practice}
\newcommand{\vcffourthree}{\tt VCFv4.3}
% #1: error message
% #2: rule description
% #3: categories
\newcommand{\samstrictrule}[3]{
#
\paragraph{} #3
% error message formatting
{\tt #1}
Expand All @@ -25,7 +24,7 @@
% #2: categories
\newcommand{\headerrequired}[2]{
\samstrictrule{Missing #1 header}{A #1 header must be present.}{#2}
}\
}
% Rule stating that at most one header of the given type may be present.
% #1: header type
% #2: categories
\newcommand{\headerunique}[2]{
\samstrictrule{Only one #1 header may be present}{Multiple #1 headers must not be present.}{#2}
}
Expand Down Expand Up @@ -125,18 +124,18 @@ \section{Headers}
-\char126]+)+\$/} or {\tt /\char94@CO\char92t.*/} or {\tt /\char94@CO\char92t.*/} regex.}{\samrule}

\subsection{HD}
\headerrequired{HD}{\v15bestpractice}
\headerrequired{HD}{\vonefivebestpractice}
\headerunique{HD}
\headertagrequired{HD}{VN}{\samrule}
\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\v15bestpractice}
\samstrictrule{File does not start with HD header.}{The first header defined must be the HD header.}{\vonefivebestpractice}
\headertagregex{HD}{VN}{/\char94[0-9]+\char92.[0-9]+\$/}{\samrule}
\samstrictrule{Unknown SAM version}{The HD header VN tag version number must match a published version of the SAM specifications.}
\headertagvalues{HD}{SO}{{\tt unknown}, {\tt unsorted}, {\tt queryname} and {\tt coordinate}}{\samrule}
\headertagvalues{HD}{GO}{{\tt none}, {\tt query}, {\tt reference}}{\samrule}
\samstrictrule{Inconsistent HD header SO and GO tags}{The record orderings defined in the HD header SO and GO tags must be consistent}

\subsection{SQ}
\samstrictrule{Missing SQ header}{The SQ header must be present if any reads have been mapped.}{\v15bestpractice}
\samstrictrule{Missing SQ header}{The SQ header must be present if any reads have been mapped.}{\vonefivebestpractice}
\headertagrequired{SQ}{SN}{\samrule}
\headertagregex{SQ}{SN}{[!-)+-\char60\char62-\char126][!-\char126]*}{\samrule}
\headertagunique{SQ}{SN}
Expand Down Expand Up @@ -172,14 +171,14 @@ \subsection{File Format}
\samstrictrule{File is not UTF-8}{The file must use UTF-8 encoding.}{\samrule}
\samstrictrule{Inconsistent line terminators}{All lines must be separated with the same new line character(s).}
\samstrictrule{Malformed floating point value}{All floating point values must conform to the regex {\tt [-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?}}
\samstrictrule{Malformed integer value}{All integer values must conform to the regex [-+]?[0-9]+}}
\samstrictrule{Malformed integer value}{All integer values must conform to the regex {\tt [-+]?[0-9]+}}

\subsection{Ordering}

\samstrictrule{Record ordering does not match HD header SO tag}{The order of records must be consistent with the HD header SO tag}
\samstrictrule{Record ordering does not match HD header GO tag}{The order of records must be consistent with the HD header GO tag}
\samstrictrule{Orphaned unmapped read}{If a read is unmapped, RNAME and POS must either be * and 0, or the RNAME and POS of another read from the same template.}

\section{Records}

\subsection{QNAME}
Expand Down Expand Up @@ -209,42 +208,42 @@ \subsection{FLAG}
\samstrictrule{Multiple primary alignment records}{Each segment must have at most one record with FLAG 0x100 and 0x800 not set.}{\samrule}
\samstrictrule{Missing primary alignment record}{Each segment must have at least one record with FLAG 0x100 and 0x800 not set.}{\samrule}
\samstrictrule{Unknown FLAG bit set}{FLAG bits higher than 0x800 must not be set}{\samrule}
\samstrictrule{Unmapped reads should not have FLAG 0x10 set}{Unmapped reads should be stored in the orientation in which they came off the sequencing machine and have 0x10 unset.}{\v15bestpractice}
\samstrictrule{Unmapped reads should not have FLAG 0x10 set}{Unmapped reads should be stored in the orientation in which they came off the sequencing machine and have 0x10 unset.}{\vonefivebestpractice}

\subsection{RNAME}
\samstrictrule{Malformed RNAME}{RNAME must conform to the regex {\tt \char92*|[!-()+-\char60\char62-\char126][!-\char126]*}}{\samrule}
\samstrictrule{RNAME not present in reference}{RNAME must be equal to the value of one of the SQ SN values defined in the header.}
\samstrictrule{RNAME contains character not supported by VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcf43}
\samstrictrule{RNAME not supported by VCFv4.3}{RNAME must be not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcf43}
\samstrictrule{RNAME does not match mate}{For a unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have RNAME identical to its mate}{\v15bestpractice}
\samstrictrule{RNAME specified for unmapped template}{If all segments in a template are unmapped, their RNAME should be set as *}{\v15bestpractice}
\samstrictrule{RNAME contains character not supported by VCFv4.3}{RNAME must not contain any of the following characters: {\tt \char60\char62\char91\char93\char58\char42}}{\vcffourthree}
\samstrictrule{RNAME not supported by VCFv4.3}{RNAME must not be one of {\tt DEL}, {\tt INS}, {\tt DUP}, {\tt INV}, {\tt CNV}, or {\tt BND}.}{\vcffourthree}
\samstrictrule{RNAME does not match mate}{For an unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have RNAME identical to its mate}{\vonefivebestpractice}
\samstrictrule{RNAME specified for unmapped template}{If all segments in a template are unmapped, their RNAME should be set as *}{\vonefivebestpractice}


\subsection{POS}
\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.}{\v15bestpractice}
\samstrictrule{Record placed outside of reference sequence}{If FLAG 0x4 is not set, POS must be between 0 and the length of the RNAME reference sequence inclusive. The length of the RNAME reference sequence can be found in the SQ header LN tag value for the SQ header with a SN tag matching the RNAME.}{\vonefivebestpractice}
\samstrictrule{Invalid POS}{POS cannot be 0 if FLAG 0x4 is set.}
\samstrictrule{Invalid POS}{POS cannot be negative.}{\samrule}
\samstrictrule{Invalid POS}{POS cannot be greater than 2147483647.}{\samrule}
\samstrictrule{POS specified without RNAME}{If RNAME is *, POS must be 0.}
\samstrictrule{POS does not match mate.}{For a unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have POS identical to its mate}{\v15bestpractice}
\samstrictrule{POS specified for unmapped template}{If all segments in a template are unmapped, their POS should be set as 0.}{\v15bestpractice}
\samstrictrule{POS does not match mate.}{For an unmapped paired-end or mate-pair read whose mate is mapped, the unmapped read should have POS identical to its mate}{\vonefivebestpractice}
\samstrictrule{POS specified for unmapped template}{If all segments in a template are unmapped, their POS should be set as 0.}{\vonefivebestpractice}

\subsection{MAPQ}
\samstrictrule{Invalid MAPQ}{MAPQ must be between 0 and 255 inclusive.}{\samrule}
\samstrictrule{Missing MAPQ}{Aligned reads should not have 255 MAPQ.}{\v15bestpractice}
\samstrictrule{Missing MAPQ}{Aligned reads should not have 255 MAPQ.}{\vonefivebestpractice}

\subsection{CIGAR}
\samstrictrule{Invalid CIGAR}{All CIGAR strings must conform to the regex {\tt \char92*|([0-9]+[MIDNSHPX=])+}}{\samrule}
\samstrictrule{Empty CIGAR}{All CIGAR strings must have at least one CIGAR operator}
\samstrictrule{Zero length CIGAR operator}{All CIGAR operators must have a non-zero positive length}
\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.}{\v15bestpractice}
\samstrictrule{CIGAR contains operator repeat}{All adjacent CIGAR operators must be different.}{\vonefivebestpractice}
\samstrictrule{CIGAR does not contain any mapped bases}{All CIGARs must include a CIGAR operator that consumes a reference base.}{\samrule}
{\tt Should we allow alignments with zero mapped bases? Seven Bridges has a graph-based aligner that will
emit CIGARs such as 100I for alignments that align to a known insertion that is not included in the reference. Useful for local assembly but technically violates the SAM specifications.}
\samstrictrule{Incorrect CIGAR length}{Sum of lengths of the M/I/S/=/X operations must equal the length of SEQ when both CIGAR and SEQ are available.}{\samrule}
\samstrictrule{Invalid CIGAR hard clip}{H must only be present as the first and/or last operation.}{\samrule}
\samstrictrule{Invalid CIGAR soft clip}{S must only have H operations between them and the ends of the CIGAR string.}{\samrule}
\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.}{\v15bestpractice}
\samstrictrule{CIGAR overhangs reference sequence}{POS plus the number of reference bases consumed by the CIGAR must not exceed the length of the RNAME reference sequence.}{\vonefivebestpractice}
\samstrictrule{Inconsistent CIGAR read lengths}{All mapped alignments for a given segment must have matching read lengths. That is, the sum of lengths of the M/I/S/=/X/H operations must be equal.}

\samstrictrule{TODO: Unusual indel positioning}{TODO: should we disallow I/D operators at the ends of reads? There was some ambiguity as to how deletions interacted with POS but I think the spec has been updated in favour of the BWA interpretation since that discussion.}
Expand All @@ -269,7 +268,7 @@ \subsection{TLEN}
\subsection{SEQ}
\samstrictrule{Inconsistent SEQ read lengths}{All alignments of a given segment must have consistent SEQ lengths. That is, for all non-* SEQ, SEQ + length of CIGAR hard clip must be equal. }
\samstrictrule{Inconsistent SEQ sequences}{All alignments of a given segment with non-* SEQ must have consistent base calls. A base cannot be called an A in one record, but a T in another. Note that to determine the read base, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.}
\samstrictrule{SEQ of secondary alignments specified.}{SEQ of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\v15bestpractice}
\samstrictrule{SEQ of secondary alignments specified.}{SEQ of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\vonefivebestpractice}
\samstrictrule{Invalid sequence base}{Unless SEQ is "*", SEQ read bases must be one of the following characters: acmgrsvtwyhkdbnACMGRSVTWYHKDBN}
\samstrictrule{SEQ does not match reference when CIGAR indicates match.}{Unless SEQ is "*", read bases with CIGAR operator = must match the reference base. Bases are considered to match if overlap between the possible read and reference bases (based on their IUPAC codes) is non-zero.}
\samstrictrule{SEQ matches reference when CIGAR indicates mismatch.}{Unless SEQ is "*", read bases with CIGAR operator X must not match the reference base. Bases are considered to match only if, when ignoring case, the reference and read bases are the same character and the character is one of the following characters: acgtACGT.}
Expand All @@ -278,7 +277,7 @@ \subsection{QUAL}
\samstrictrule{QUAL specified without SEQ}{QUAL must be * if SEQ is *}{\samrule}
\samstrictrule{SEQ QUAL length mismatch.}{The length of a non-* QUAL must match the length of SEQ.}{\samrule}
\samstrictrule{Invalid QUAL}{The ASCII value of all QUAL bases must be at least 33.}
\samstrictrule{QUAL of secondary alignments specified.}{QUAL of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\v15bestpractice}
\samstrictrule{QUAL of secondary alignments specified.}{QUAL of secondary alignments (0x100 FLAG set) should be set to * to reduce the file size.}{\vonefivebestpractice}
\samstrictrule{TODO: QUAL edge case}{What should we do when a read is length 1 and the QUAL encodes to "*" ?}{\samrule}
\samstrictrule{Inconsistent QUAL scores}{All alignments with non-* QUAL of a given segment must have consistent base quality scores. Note that to determine the base quality, both the 0x10 FLAG, and any hard clipping CIGAR operators need to be taken into account.}

Expand All @@ -294,7 +293,7 @@ \subsection{Tag format}
\samstrictrule{Malformed B tag}{B tags must conform to the regex {\tt [cCsSiIf](,[-+]?[0-9]*\char92.?[0-9]+([eE][-+]?[0-9]+)?)+}}{\samrule}
\samstrictrule{Non-integer value in integer array}{Type B tags starting with one of "cCsSiI" must contain integer values.}
\samstrictrule{Tag array value out of bounds}{Type B tags must not contain values that are greater than or less than the maximum or minimum value representable by the specified prefix.}
\samstrictrule{Tag integer out of bounds for BAM representation}{Type i tag values must be within the range ~$[-2^{31},2^{32})$}{\bam}
\samstrictrule{Tag integer out of bounds for BAM representation}{Type i tag values must be within the range {\tt ~$[-2^{31},2^{32})$}}{\bam}
\samstrictrule{Unknown reserved tag}{
No record can include any reserved tags not defined in the
{\sl Sequence Alignment/Map Optional Fields Specification}.
Expand All @@ -320,10 +319,10 @@ \subsection{Tag format}

\subsection{RG}
3 When a RG tag appears anywhere in the alignment section, there should be a single corresponding
@RG line with matching ID tag in the header.{\v15bestpractice}
@RG line with matching ID tag in the header.{\vonefivebestpractice}
\subsection{PG}
4 When a PG tag appears anywhere in the alignment section, there should be a single corresponding
@PG line with matching ID tag in the header.{\v15bestpractice}
@PG line with matching ID tag in the header.{\vonefivebestpractice}


\paragraph{}
Expand Down
Loading

0 comments on commit 14b4ca0

Please sign in to comment.