Skip to content

Commit

Permalink
Merge pull request #120 from J35P312/master
Browse files Browse the repository at this point in the history
Tiddit 3.9.0
  • Loading branch information
J35P312 authored Nov 19, 2024
2 parents 9bb201f + 5106380 commit 14d1422
Show file tree
Hide file tree
Showing 13 changed files with 502 additions and 174 deletions.
37 changes: 0 additions & 37 deletions CMakeLists.txt

This file was deleted.

46 changes: 9 additions & 37 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,55 +1,27 @@
FROM python:3.8-slim

RUN apt-get update && \
apt-get upgrade -y && \
apt-get install -y \
autoconf \
automake \
build-essential \
cmake \
libbz2-dev \
libcurl4-gnutls-dev \
liblzma-dev \
libncurses5-dev \
libssl-dev \
make \
unzip \
wget \
zlib1g-dev && \
apt-get clean && \
apt-get purge && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
FROM condaforge/mambaforge:24.9.2-0

WORKDIR /app

## Install samtools for cram processing
RUN wget --no-verbose https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2 && \
bunzip2 samtools-1.10.tar.bz2 && \
tar -xf samtools-1.10.tar && \
cd samtools-1.10 && \
./configure && \
make all all-htslib && \
make install install-htslib && \
rm /app/samtools-1.10.tar

## Set TIDDIT version
ARG TIDDIT_VERSION=2.12.1
ARG TIDDIT_VERSION=3.9.0

## Add some info
LABEL base_image="python:3.8-slim"
LABEL software="TIDDIT.py"
LABEL software.version=${TIDDIT_VERSION}

## Download and extract
RUN conda install conda-forge::unzip
RUN conda install -c conda-forge pip gcc joblib
RUN conda install -c bioconda numpy cython pysam bwa

RUN wget https://github.com/SciLifeLab/TIDDIT/archive/TIDDIT-${TIDDIT_VERSION}.zip && \
unzip TIDDIT-${TIDDIT_VERSION}.zip && \
rm TIDDIT-${TIDDIT_VERSION}.zip

## Install
RUN cd TIDDIT-TIDDIT-${TIDDIT_VERSION} && \
./INSTALL.sh && \
chmod +x /app/TIDDIT-TIDDIT-${TIDDIT_VERSION}/TIDDIT.py && \
ln -s /app/TIDDIT-TIDDIT-${TIDDIT_VERSION}/TIDDIT.py /usr/local/bin
RUN cd TIDDIT-TIDDIT-${TIDDIT_VERSION} && \
pip install -e .

ENTRYPOINT ["TIDDIT.py"]
ENTRYPOINT ["tiddit"]
CMD ["--help"]
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
DESCRIPTION
==============
TIDDIT is a tool used to identify chromosomal rearrangements using Mate Pair or Paired End sequencing data. TIDDIT identifies intra- and inter-chromosomal translocations, deletions, tandem-duplications and inversions, using supplementary alignments as well as discordant pairs.
TIDDIT searches for discordant reads and splti reads (supplementary alignments). The supplementary alignments are assembled and aligned using a fermikit-like workflow.
TIDDIT searches for discordant reads and split reads (supplementary alignments). TIDDIT also performs local assembly using a custom local de novo assembler.
Next all signals (contigs, split-reads, and discordant pairs) are clustered using DBSCAN. The resulting clusters are filtered and annotated, and reported as SV depending on the statistics.
TIDDIT has two analysis modules: the sv mode, which is used to search for structural variants, and the cov mode, which analyses the read depth of a bam file and generates a coverage report.
On a 30X human genome, the TIDDIT SV module typically completes within 5 hours, and requires less than 10 GB of RAM.
Expand All @@ -27,11 +27,11 @@ cd tiddit
pip install -e .
```

Next install fermi2, ropebwt2, and bwa, I recommend using conda:
Next install bwa, I recommend using conda:

conda install fermi2 ropebwt2 bwa
conda install bwa

You may also compile bwa, fermi2, and ropebwt2 yourself. Remember to add executables to path, or provide path through the command line parameters.
You may also compile bwa yourself. Remember to add executables to path, or provide path through the command line parameters.
```
tiddit --help
tiddit --sv --help
Expand Down
36 changes: 0 additions & 36 deletions Singularity

This file was deleted.

4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@
"tiddit/tiddit_cluster.pyx",
"tiddit/tiddit_coverage_analysis.pyx",
"tiddit/tiddit_gc.pyx",
"tiddit/silverfish.pyx",
"tiddit/graphlib.pyx",
"tiddit/tiddit_variant.pyx",
"tiddit/tiddit_contig_analysis.pyx"])
else:
ext_modules = []

setup(
name = 'tiddit',
version = '3.8.0',
version = '3.9.0',


url = "https://github.com/SciLifeLab/TIDDIT",
Expand Down
2 changes: 1 addition & 1 deletion tiddit/DBSCAN.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def x_coordinate_clustering(data,epsilon,m):

distances=[]
current=data[i,:]
points=data[i+1:i+m,:]
points=data[i+1:i+m+1,:]

#print points
distances=[]
Expand Down
41 changes: 23 additions & 18 deletions tiddit/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import tiddit.tiddit_gc as tiddit_gc

def main():
version="3.8.0"
version="3.9.0"
parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
Expand All @@ -27,9 +27,13 @@ def main():
if args.sv == True:

parser = argparse.ArgumentParser("""tiddit --sv --bam inputfile [-o prefix] --ref ref.fasta""")
#required parameters
parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true")
parser.add_argument('--force_overwrite' , help="force the analysis and overwrite any data in the output folder", required=False, action="store_true")
parser.add_argument('--bam', type=str,required=True, help="coordinate sorted bam file(required)")
parser.add_argument('--ref', type=str, help="reference fasta",required=True)

#related to sv calling
parser.add_argument('-o', type=str,default="output", help="output prefix(default=output)")
parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
parser.add_argument('-d', type=str,help="expected reads orientations, possible values \"innie\" (-> <-) or \"outtie\" (<- ->). Default: major orientation within the dataset")
Expand All @@ -42,19 +46,28 @@ def main():
parser.add_argument('-c', type=float, help="average coverage, overwrites the estimated average coverage (useful for exome or panel data)")
parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3),must be set >= 2")
parser.add_argument('-s', type=int,default=25000000, help="Number of reads to sample when computing library statistics(default=25000000)")
parser.add_argument('-z', type=int,default=50, help="minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recomended)")
parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)")
parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)")
parser.add_argument('--ref', type=str, help="reference fasta",required=True)
parser.add_argument('--bwa', type=str,default="bwa", help="path to bwa executable file(default=bwa)")
parser.add_argument('--fermi2', type=str,default="fermi2", help="path to fermi2 executable file (default=fermi2)")
parser.add_argument('--ropebwt2', type=str , default="ropebwt2", help="path to ropebwt2 executable file (default=ropebwt2)")
parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but wont require fermi2, bwa, ropebwt and bwa indexed ref")
#parser.add_argument('--skip_index', action="store_true", help="Do not generate the csi index")

#stuff related to filtering
parser.add_argument('--p_ratio', type=float,default=0.1, help="minimum discordant pair/normal pair ratio at the breakpoint junction(default=0.1)")
parser.add_argument('--r_ratio', type=float,default=0.1, help="minimum split read/coverage ratio at the breakpoint junction(default=0.1)")
parser.add_argument('--max_coverage', type=float,default=4, help="filter call if X times higher than chromosome average coverage (default=4)")
parser.add_argument('--min_contig', type=int,default=10000, help="Skip calling on small contigs (default < 10000 bp)")
parser.add_argument('-z', type=int,default=50, help="minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recomended)")

#assembly related stuff
parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but wont require bwa and bwa indexed ref, and will complete quicker")
parser.add_argument('--bwa', type=str,default="bwa", help="path to bwa executable file(default=bwa)")
parser.add_argument('--min_clip', type=int,default=4, help="Minimum clip reads to initiate local assembly of a region(default=4)")
parser.add_argument('--padding', type=int,default=4, help="Extend the local assembly by this number of bases (default=200bp)")
parser.add_argument('--min_pts_clips', type=int,default=3, help="min-pts parameter for the clustering of candidates for local assembly (default=3)")
parser.add_argument('--max_assembly_reads', type=int,default=100000, help="Skip assembly of regions containing too many reads (default=10000 reads)")
parser.add_argument('--max_local_assembly_region', type=int,default=2000, help="maximum size of the clip read cluster for being considered a local assembly candidate (default=2000 bp)")
parser.add_argument('--min_anchor_len', type=int,default=60, help="minimum mapped bases to be considered a clip read (default=60 bp)")
parser.add_argument('--min_clip_len', type=int,default=25, help="minimum clipped bases to be considered a clip read (default=25 bp)")
parser.add_argument('--min_contig_len', type=int,default=200, help="minimum contig length for SV analysis (default=200 bp)")
parser.add_argument('-k', type=int,default=91, help="kmer lenght used by the local assembler (default=91 bp)")
args= parser.parse_args()

if args.l < 2:
Expand All @@ -63,15 +76,7 @@ def main():

if not args.skip_assembly:
if not os.path.isfile(args.bwa) and not shutil.which(args.bwa):
print("error, BWA executable missing, add BWA to path, or specify using --bwa")
quit()

if not os.path.isfile(args.fermi2) and not shutil.which(args.fermi2):
print("error, fermi2 executable missing, add fermi2 to path, or specify using --fermi2")
quit()

if not os.path.isfile(args.ropebwt2) and not shutil.which(args.ropebwt2):
print("error, ropebwt2 executable missing, add ropebwt2 to path, or specify using --ropebwt2")
print("error, BWA executable missing, add BWA to path, or specify using --bwa, or skip local assembly (--skip_assembly)")
quit()

if not glob.glob("{}*.bwt*".format(args.ref)):
Expand Down Expand Up @@ -150,7 +155,7 @@ def main():


t=time.time()
coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False)
coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False,args.min_anchor_len,args.min_clip_len)
print("extracted signals in:")
print(t-time.time())

Expand Down
105 changes: 105 additions & 0 deletions tiddit/graphlib.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
class graph:
    """De Bruijn-style kmer graph used by the local assembler.

    Nodes are kmers; directed edges ("vertices" in this codebase's
    terminology) connect consecutive kmers observed in a read. All
    bookkeeping is maintained incrementally on every add/delete:

    * in_branch_points / out_branch_points: kmers with more than one
      successor / predecessor.
    * starting_points / end_points: kmers with no predecessor / no
      successor.
    """

    def __init__(self):
        # predecessors[kmer] / sucessors[kmer]: sets of neighbouring kmers
        # (spelling "sucessors"/"vertice" kept — it is part of the public API)
        self.predecessors = {}
        self.sucessors = {}
        # kmers[kmer]: set of reads supporting the kmer
        self.kmers = {}
        # vertices[kmer1][kmer2]: set of reads supporting edge kmer1 -> kmer2
        self.vertices = {}
        # vertice_set: all edges as (kmer1, kmer2) tuples, for fast lookup
        self.vertice_set = set([])
        self.in_branch_points = set([])
        self.out_branch_points = set([])
        self.starting_points = set([])
        self.end_points = set([])

    # add node to the graph
    def add_kmer(self, kmer, read):
        """Add a kmer node (if new) and record the supporting read."""
        if not kmer in self.kmers:
            self.kmers[kmer] = set([])
            self.predecessors[kmer] = set([])
            self.sucessors[kmer] = set([])
            # an isolated kmer is both a starting point and an end point
            self.starting_points.add(kmer)
            self.end_points.add(kmer)

        self.kmers[kmer].add(read)

    # add vertices between nodes
    def add_vertice(self, kmer1, kmer2, read):
        """Add a directed edge kmer1 -> kmer2 supported by read."""
        self.add_kmer(kmer1, read)
        self.add_kmer(kmer2, read)

        if not kmer1 in self.vertices:
            self.vertices[kmer1] = {}

        # kmer1 now has a successor, so it is no longer an end point
        if kmer1 in self.end_points:
            self.end_points.remove(kmer1)

        if not kmer2 in self.vertices[kmer1]:
            self.vertices[kmer1][kmer2] = set([])

        # kmer2 now has a predecessor, so it is no longer a starting point
        if kmer2 in self.starting_points:
            self.starting_points.remove(kmer2)

        self.vertices[kmer1][kmer2].add(read)

        self.vertice_set.add((kmer1, kmer2))

        self.sucessors[kmer1].add(kmer2)
        if len(self.sucessors[kmer1]) > 1:
            self.in_branch_points.add(kmer1)

        self.predecessors[kmer2].add(kmer1)
        if len(self.predecessors[kmer2]) > 1:
            self.out_branch_points.add(kmer2)

    def delete_vertice(self, kmer1, kmer2):
        """Remove the edge kmer1 -> kmer2 and update all bookkeeping sets."""
        if kmer1 in self.vertices:
            if kmer2 in self.vertices[kmer1]:
                del self.vertices[kmer1][kmer2]

            if len(self.vertices[kmer1]) == 0:
                del self.vertices[kmer1]

        # branch-point counts are tested *before* the neighbour is removed,
        # so "< 3" here means "at most 1 neighbour left after removal"
        if kmer1 in self.in_branch_points:
            if len(self.sucessors[kmer1]) < 3:
                self.in_branch_points.remove(kmer1)

        if kmer1 in self.sucessors:
            if kmer2 in self.sucessors[kmer1]:
                self.sucessors[kmer1].remove(kmer2)

            if not self.sucessors[kmer1]:
                self.end_points.add(kmer1)

        if kmer2 in self.out_branch_points:
            if len(self.predecessors[kmer2]) < 3:
                self.out_branch_points.remove(kmer2)

        if kmer1 in self.predecessors[kmer2]:
            self.predecessors[kmer2].remove(kmer1)

        if not len(self.predecessors[kmer2]):
            self.starting_points.add(kmer2)

        if (kmer1, kmer2) in self.vertice_set:
            self.vertice_set.remove((kmer1, kmer2))

    def delete_kmer(self, kmer):
        """Remove a kmer and all of its incoming and outgoing edges.

        A kmer that was never added to the graph is ignored. (The original
        code guarded only the self.kmers deletion and then read
        self.sucessors[kmer] unconditionally, raising KeyError for unknown
        kmers even though the guard implied they should be tolerated.)
        """
        if kmer not in self.sucessors:
            # never added (no bookkeeping entries): nothing to do
            return

        if kmer in self.kmers:
            del self.kmers[kmer]

        # copy the neighbour sets: delete_vertice mutates them while we iterate
        sucessors = set([])
        for sucessor in self.sucessors[kmer]:
            sucessors.add(sucessor)

        predecessors = set([])
        for predecessor in self.predecessors[kmer]:
            predecessors.add(predecessor)

        for predecessor in predecessors:
            self.delete_vertice(predecessor, kmer)

        for sucessor in sucessors:
            self.delete_vertice(kmer, sucessor)

        # NOTE(review): as in the original, the kmer's now-empty
        # predecessors/sucessors entries and its membership in
        # starting_points/end_points are left in place — confirm no caller
        # relies on them before cleaning up.
Loading

0 comments on commit 14d1422

Please sign in to comment.