Skip to content

Commit a05afe2

Browse files
authored
Merge pull request #70 from cancerit/develop
merge 3.5.1 into main
2 parents 68fab43 + 667da4f commit a05afe2

File tree

8 files changed

+217
-51
lines changed

8 files changed

+217
-51
lines changed

CHANGES.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# CHANGES
22

3+
## 3.5.1
4+
* Dockerfile updated to avoid snv_merge_and_vaf_calc.R bug and to improve reproducibility
5+
* Small change to hardcoded overlapping_mask value in indel caller
6+
7+
## 3.5.0
8+
* Minor bug in the indel pipeline fixed (was using bitwise OR instead of logical OR)
9+
* New quality metrics added to the indel calls such that SNVs and indels come out with the same metrics
10+
11+
## 3.4.0
12+
13+
* Fixed faulty PART step that was dropping last genomic intervals for targeted experiments
14+
* Refactored how indels get merged to avoid errors when scaling up calculations
15+
* Updated htslibs, samtools, bcftools and libdeflate
16+
317
## 3.3.0
418

519
* Added new INFO fields to the final vcf

Dockerfile

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,43 @@
1-
FROM ubuntu:22.04 as builder
1+
FROM ubuntu:18.04 as builder
22

33
USER root
44

55
# ALL tool versions used by opt-build.sh
6-
ENV VER_SAMTOOLS="1.14"
7-
ENV VER_HTSLIB="1.14"
8-
ENV VER_BCFTOOLS="1.14"
6+
ENV VER_SAMTOOLS="1.18"
7+
ENV VER_HTSLIB="1.18"
8+
ENV VER_BCFTOOLS="1.18"
99
ENV VER_VERIFYBAMID="2.0.1"
10-
ENV VER_LIBDEFLATE="v1.12"
10+
ENV VER_LIBDEFLATE="v1.18"
1111

1212
ENV DEBIAN_FRONTEND=noninteractive
1313
RUN apt-get -yq update
1414
RUN apt-get install -yq --no-install-recommends locales
1515
RUN apt-get install -yq --no-install-recommends g++
1616
RUN apt-get install -yq --no-install-recommends ca-certificates
17-
RUN apt-get install -yq --no-install-recommends cmake
17+
RUN apt-get install -yq --no-install-recommends wget
18+
19+
# install a recent cmake from the Kitware apt repository (version pinned below) so opt-build.sh works - the initial installs will also help install R
20+
RUN apt-get install -yq --no-install-recommends software-properties-common lsb-release
21+
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
22+
RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
23+
RUN apt-get install -yq --no-install-recommends cmake=3.25.2-0kitware1ubuntu18.04.1
24+
1825
RUN apt-get install -yq --no-install-recommends make
19-
RUN apt-get install -yq --no-install-recommends bzip2
20-
RUN apt-get install -yq --no-install-recommends gcc
2126
RUN apt-get install -yq --no-install-recommends pkg-config
22-
RUN apt-get install -yq --no-install-recommends wget
23-
RUN apt-get install -yq --no-install-recommends locales
24-
RUN apt-get install -yq --no-install-recommends r-base
27+
28+
# if ubuntu 18.04
29+
# Use apt-get (stable scripting interface) instead of apt, consistent with the other install steps
RUN apt-get install -yq --no-install-recommends dirmngr
30+
RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
31+
RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
32+
RUN apt-get install -yq --no-install-recommends r-base-core=4.1.3-1.1804.0
33+
RUN apt-mark hold r-base-core
34+
RUN apt-get install -yq --no-install-recommends r-cran-mass=7.3-51.5-2bionic0 r-cran-class=7.3-16-1bionic0 r-cran-nnet=7.3-13-1bionic0
35+
RUN apt-get install -yq --no-install-recommends r-recommended=4.1.3-1.1804.0
36+
RUN apt-get install -yq --no-install-recommends r-base=4.1.3-1.1804.0
37+
RUN apt-mark hold r-base r-recommended
38+
# if ubuntu 22.04
39+
# RUN apt-get install -yq --no-install-recommends r-base=4.1.2-1ubuntu2
40+
2541
RUN apt-get install -yq --no-install-recommends zlib1g-dev
2642
RUN apt-get install -yq --no-install-recommends libbz2-dev
2743
RUN apt-get install -yq --no-install-recommends liblzma-dev
@@ -57,11 +73,11 @@ RUN bash build/opt-build.sh $OPT
5773
COPY . .
5874
RUN bash build/opt-build-local.sh $OPT
5975

60-
FROM ubuntu:22.04
76+
FROM ubuntu:18.04
6177

6278
LABEL maintainer="[email protected]" \
6379
uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \
64-
version="1.0.0" \
80+
version="1.0.1" \
6581
description="nanoseq docker"
6682

6783
ENV DEBIAN_FRONTEND=noninteractive
@@ -70,19 +86,21 @@ RUN apt-get install -yq --no-install-recommends \
7086
apt-transport-https \
7187
locales \
7288
curl \
89+
wget \
90+
make \
91+
g++ \
92+
gcc \
93+
gfortran \
94+
libblas-dev \
95+
liblapack-dev \
7396
ca-certificates \
7497
time \
7598
zlib1g \
99+
libz-dev \
76100
python3 \
77-
r-base \
78-
r-cran-ggplot2 \
79-
r-cran-data.table \
80-
r-cran-epitools \
81-
r-cran-gridextra \
82-
r-cran-seqinr \
83101
libxml2 \
84-
libgsl27 \
85-
libperl5.34 \
102+
libgsl23 \
103+
libperl5.26 \
86104
libcapture-tiny-perl \
87105
libfile-which-perl \
88106
libpng16-16 \
@@ -92,6 +110,18 @@ unattended-upgrade -d -v && \
92110
apt-get remove -yq unattended-upgrades && \
93111
apt-get autoremove -yq
94112

113+
# Use apt-get (stable scripting interface) instead of apt, consistent with the other install steps
RUN apt-get install -yq --no-install-recommends software-properties-common dirmngr
114+
RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
115+
RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
116+
RUN apt-get install -yq --no-install-recommends r-base-core=4.1.3-1.1804.0
117+
RUN apt-mark hold r-base-core
118+
RUN apt-get install -yq --no-install-recommends r-cran-mass=7.3-51.5-2bionic0 r-cran-class=7.3-16-1bionic0 r-cran-nnet=7.3-13-1bionic0
119+
RUN apt-get install -yq --no-install-recommends r-recommended=4.1.3-1.1804.0
120+
RUN apt-get install -yq --no-install-recommends r-base=4.1.3-1.1804.0
121+
RUN apt-mark hold r-base r-recommended
122+
# Plain local file: COPY is the explicit instruction (ADD's archive/URL handling is not needed here)
COPY build/libInstall2.R build/
123+
RUN Rscript build/libInstall2.R
124+
95125
RUN locale-gen en_US.UTF-8
96126
RUN update-locale LANG=en_US.UTF-8
97127

R/snv_merge_and_vaf_calc.R

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ if (length(grep("\\.gz", muts_vcf)) > 0) {
8686
} else {
8787
num_snvs = system(paste("grep -cv \"^#\" ", muts_vcf, "|| true"), intern = TRUE)
8888
}
89+
num_snvs = as.integer(num_snvs)
8990

9091
num_indels = 0
9192
if (indel_vcf != '-') {
@@ -96,6 +97,7 @@ if (indel_vcf != '-') {
9697
num_indels = system(paste("grep -cv \"^#\" ", indel_vcf, "|| true"), intern = TRUE)
9798
}
9899
}
100+
num_indels = as.integer(num_indels)
99101

100102
col_t = c("character","numeric","character","character","character","character","character")
101103
if (num_snvs == 0) {
@@ -245,8 +247,8 @@ if (num_snvs > 0) {
245247
snvs_new2[new_row, "BEND"] = paste(snvs_tmp[, "BEND"], collapse = ",")
246248
snvs_new2[new_row, "TRI"] = snvs_tmp[1, "TRI"]
247249
snvs_new2[new_row, "QPOS"] = paste(snvs_tmp[, "QPOS"], collapse = ",")
248-
snvs_new2[new_row, "DEPTH_FWD"] = median(snvs_tmp[, "DEPTH_FWD"])
249-
snvs_new2[new_row, "DEPTH_REV"] = median(snvs_tmp[, "DEPTH_REV"])
250+
snvs_new2[new_row, "DEPTH_FWD"] = median(as.numeric(snvs_tmp[, "DEPTH_FWD"]))
251+
snvs_new2[new_row, "DEPTH_REV"] = median(as.numeric(snvs_tmp[, "DEPTH_REV"]))
250252
snvs_new2[new_row, "DEPTH_NORM_FWD"] = snvs_tmp[1, "DEPTH_NORM_FWD"]
251253
snvs_new2[new_row, "DEPTH_NORM_REV"] = snvs_tmp[1, "DEPTH_NORM_REV"]
252254
snvs_new2[new_row, "TIMES_CALLED"] = freq
@@ -370,13 +372,26 @@ if (num_indels > 0) {
370372
indels_new[new_row, "MQ"] = indels_tmp[1, "MQ"]
371373
indels_new[new_row, "DP4"] = indels_tmp[1, "DP4"]
372374
indels_new[new_row, "mut_id"] = indels_tmp[1, "mut_id"]
375+
376+
indels_new[new_row, "BBEG"] = paste(indels_tmp[, "BBEG"], collapse = ",")
377+
indels_new[new_row, "BEND"] = paste(indels_tmp[, "BEND"], collapse = ",")
378+
indels_new[new_row, "QPOS"] = paste(indels_tmp[, "QPOS"], collapse = ",")
379+
indels_new[new_row, "DEPTH_FWD"] = median(as.numeric(indels_tmp[, "DEPTH_FWD"]))
380+
indels_new[new_row, "DEPTH_REV"] = median(as.numeric(indels_tmp[, "DEPTH_REV"]))
381+
indels_new[new_row, "DEPTH_NORM_FWD"] = indels_tmp[1, "DEPTH_NORM_FWD"]
382+
indels_new[new_row, "DEPTH_NORM_REV"] = indels_tmp[1, "DEPTH_NORM_REV"]
383+
indels_new[new_row, "DPLX_ASXS"] = paste(indels_tmp[, "DPLX_ASXS"], collapse = ",")
384+
indels_new[new_row, "DPLX_CLIP"] = paste(indels_tmp[, "DPLX_CLIP"], collapse = ",")
385+
indels_new[new_row, "DPLX_NM"] = paste(indels_tmp[, "DPLX_NM"], collapse = ",")
386+
indels_new[new_row, "BULK_ASXS"] = paste(indels_tmp[, "BULK_ASXS"], collapse = ",")
387+
indels_new[new_row, "BULK_NM"] = paste(indels_tmp[, "BULK_NM"], collapse = ",")
373388
}
374389
}
375390

376391
indels_new = indels_new[order(indels_new$chr, indels_new$pos),]
377392

378393
# drop some columns:
379-
indels_new = indels_new[, c("chr", "pos", "kk", "ref", "mut", "qual", "filter", "rb_id", "TYPE", "TIMES_CALLED", "SEQ")]
394+
indels_new = indels_new[, c("chr", "pos", "kk", "ref", "mut", "qual", "filter", "rb_id", "TYPE", "TIMES_CALLED", "SEQ","BBEG","BEND","QPOS","DEPTH_FWD","DEPTH_REV","DEPTH_NORM_FWD","DEPTH_NORM_REV","DPLX_ASXS","DPLX_CLIP","DPLX_NM","BULK_ASXS","BULK_NM")]
380395

381396
##########################################################################################
382397
# Calculate VAFs for indels
@@ -468,13 +483,23 @@ if (num_indels > 0) {
468483
indels_final$INFO = paste(indels_final$INFO, rep("DUPLEX_VAF=", nrow(indels_final)), indels_final$DUPLEX_VAF, ";", sep = "")
469484
indels_final$INFO = paste(indels_final$INFO, rep("BAM_VAF=", nrow(indels_final)), indels_final$BAM_VAF, ";", sep = "")
470485
indels_final$INFO = paste(indels_final$INFO, rep("BAM_VAF_BQ10=", nrow(indels_final)), indels_final$BAM_VAF_BQ10, ";", sep = "")
486+
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_NORM_FWD=", nrow(indels_final)), indels_final$DEPTH_NORM_FWD, ";", sep = "")
487+
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_NORM_REV=", nrow(indels_final)), indels_final$DEPTH_NORM_REV, ";", sep = "")
488+
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_FWD=", nrow(indels_final)), indels_final$DEPTH_FWD, ";", sep = "")
489+
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_REV=", nrow(indels_final)), indels_final$DEPTH_REV, ";", sep = "")
471490
indels_final$INFO = paste(indels_final$INFO, rep("SEQ=", nrow(indels_final)), indels_final$SEQ, ";", sep = "")
472491
indels_final$INFO = paste(indels_final$INFO, rep("DUPLEX_COV=", nrow(indels_final)), indels_final$DUPLEX_COV, ";", sep = "")
473492
indels_final$INFO = paste(indels_final$INFO, rep("BAM_MUT=", nrow(indels_final)), indels_final$BAM_MUT, ";", sep = "")
474493
indels_final$INFO = paste(indels_final$INFO, rep("BAM_COV=", nrow(indels_final)), indels_final$BAM_COV, ";", sep = "")
475494
indels_final$INFO = paste(indels_final$INFO, rep("BAM_MUT_BQ10=", nrow(indels_final)), indels_final$BAM_MUT_BQ10, ";", sep = "")
476495
indels_final$INFO = paste(indels_final$INFO, rep("BAM_COV_BQ10=", nrow(indels_final)), indels_final$BAM_COV_BQ10, ";", sep = "")
477496
# Terminate the RB entry with ";" because further INFO entries (QPOS, DPLX_*, BULK_*) are appended after it;
# without the separator the output would read "RB=<id>QPOS=..." and merge two INFO keys into one.
indels_final$INFO = paste(indels_final$INFO, rep("RB=", nrow(indels_final)), indels_final$rb_id, ";", sep = "")
497+
indels_final$INFO = paste(indels_final$INFO, rep("QPOS=", nrow(indels_final)), indels_final$QPOS, ";", sep = "")
498+
indels_final$INFO = paste(indels_final$INFO, rep("DPLX_ASXS=", nrow(indels_final)), indels_final$DPLX_ASXS, ";", sep = "")
499+
indels_final$INFO = paste(indels_final$INFO, rep("DPLX_CLIP=", nrow(indels_final)), indels_final$DPLX_CLIP, ";", sep = "")
500+
indels_final$INFO = paste(indels_final$INFO, rep("DPLX_NM=", nrow(indels_final)), indels_final$DPLX_NM, ";", sep = "")
501+
indels_final$INFO = paste(indels_final$INFO, rep("BULK_ASXS=", nrow(indels_final)), indels_final$BULK_ASXS, ";", sep = "")
502+
indels_final$INFO = paste(indels_final$INFO, rep("BULK_NM=", nrow(indels_final)), indels_final$BULK_NM, "", sep = "")
478503
}
479504

480505
# Describe in header:

build/libInstall2.R

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env Rscript
2+
3+
########## LICENCE ##########
4+
# Copyright (c) 2022 Genome Research Ltd
5+
#
6+
# Author: CASM/Cancer IT <[email protected]>
7+
#
8+
# This file is part of NanoSeq.
9+
#
10+
# This program is free software: you can redistribute it and/or modify
11+
# it under the terms of the GNU Affero General Public License as
12+
# published by the Free Software Foundation, either version 3 of the
13+
# License, or (at your option) any later version.
14+
#
15+
# This program is distributed in the hope that it will be useful,
16+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
# GNU Affero General Public License for more details.
19+
#
20+
# You should have received a copy of the GNU Affero General Public License
21+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
#
23+
# 1. The usage of a range of years within a copyright statement contained within
24+
# this distribution should be interpreted as being equivalent to a list of years
25+
# including the first and last year specified and all consecutive years between
26+
# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007-
27+
# 2009, 2011-2012’ should be interpreted as being identical to a statement that
28+
# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright
29+
# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being
30+
# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008,
31+
# 2009, 2010, 2011, 2012’.
32+
###########################
33+
34+
#install R packages
35+
install.packages(c("ggplot2", "data.table", "epitools", "gridExtra", "seqinr"))

build/opt-build.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,10 @@ else
107107
get_distro "libdeflate" "https://github.com/ebiggers/libdeflate/archive/$VER_LIBDEFLATE.tar.gz"
108108
tar --strip-components 1 -C libdeflate -zxf libdeflate.tar.gz
109109
cd libdeflate
110-
make -j$CPU CFLAGS="-fPIC -O3" libdeflate.a
111-
PREFIX=$INST_PATH make install
110+
cmake -B build
111+
cmake --build build
112+
cmake --install build
113+
cmake --install build --prefix $INST_PATH
112114
cd $SETUP_DIR
113115
rm -r libdeflate.tar.gz
114116
touch $SETUP_DIR/libdeflate.success

perl/indelCaller_step1.pl

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,9 @@
109109
my $bulk_rev = $bulkReverseTotal;
110110
next if($bulk_fwd + $bulk_rev < $bulk_min_cov); # Bulk minimum coverage
111111
next if($dplxCLIP > $max_clip);
112-
next if($dplxNM > 20); # made very liberal to allow long indels. Check the impact!
113-
next if($dplxASXS < $min_asxs | $bulkASXS < $min_asxs ); #fa8: fixed bug, we needed to check AS-XS for the bulk too
112+
next if($dplxNM > 20); # made very liberal to allow long indels. Check the impact! --> it seems to be working, good
113+
next if($dplxASXS < $min_asxs || $bulkASXS < $min_asxs ); #fa8: fixed bug, we needed to check AS-XS for the bulk too
114+
# Fixed bug: changed from bitwise OR to logical OR (ainsss)
114115

115116
if($r1 >= $min_size_subfam && $r2 >= $min_size_subfam) {
116117
my $bulktotal = $bulkForwardTotal+$bulkReverseTotal;
@@ -152,7 +153,17 @@
152153
$signature_trinuc = &reverse_signature($signature_trinuc);
153154
}
154155
$site_tags .= "$signature_trinuc;SW=$shearwater;cSNP=$commonSNP";
155-
156+
$site_tags .= ";BBEG=$dplxBreakpointBeg";
157+
$site_tags .= ";BEND=$dplxBreakpointEnd";
158+
$site_tags .= ";DEPTH_FWD=$r1";
159+
$site_tags .= ";DEPTH_REV=$r2";
160+
$site_tags .= ";DEPTH_NORM_FWD=$bulkForwardTotal";
161+
$site_tags .= ";DEPTH_NORM_REV=$bulkReverseTotal";
162+
$site_tags .= ";DPLX_ASXS=$dplxASXS";
163+
$site_tags .= ";DPLX_CLIP=$dplxCLIP";
164+
$site_tags .= ";DPLX_NM=$dplxNM";
165+
$site_tags .= ";BULK_ASXS=$bulkASXS";
166+
$site_tags .= ";BULK_NM=$bulkNM";
156167
# If seen in the bulk, flag it:
157168
if($bulkForwardIndel+$bulkReverseIndel > $max_vaf * $bulktotal) {
158169
$site_tags .= ";BULK_SEEN($dplxForwardIndel+$dplxReverseIndel/$bulktotal)";

0 commit comments

Comments
 (0)