Skip to content

Commit 1361e67

Browse files
committed
changes for version 1.3.3
1 parent 882cacc commit 1361e67

File tree

5 files changed

+41
-41
lines changed

5 files changed

+41
-41
lines changed

README.md

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,22 +49,16 @@ Install python modules (we strongly recommend installation via conda):
4949
conda install -c bioconda pysam=0.15.2 star=2.6.1b star-fusion=1.5.0 bowtie2=2.3.4.3 bx-python=0.8.2 crossmap=0.2.7
5050
```
5151

52-
53-
- R (>= 3.5.1)
52+
- R (>= 3.6.0)
5453
- R packages:
55-
- optparse
56-
- tidyverse
57-
- randomForest
58-
- Biostrings
59-
- GenomicRanges
60-
- BSgenome
61-
- bindrcpp
54+
- optparse (1.6.4)
55+
- tidyverse (1.3.0)
56+
- randomForest (4.6-14)
6257

6358
Install packages within R by
6459

6560
```
66-
install.packages(c("optparse", "tidyverse", "randomForest", "Biostrings","BiocManager","BSgenome","optparse"))
67-
BiocManager::install("GenomicRanges") #bioconductor package
61+
install.packages(c("optparse", "tidyverse", "randomForest"))
6862
```
6963

7064
## Usage
@@ -86,10 +80,10 @@ processing.py \
8680

8781
Before executing the example command
8882

89-
- [ ] rename `build_env.sh.smaple` into `build_env.sh` and configure content.
90-
- [ ] rename `config.py.smaple` into `config.py` and configure content.
91-
- [ ] rename `blacklist.txt.sample` into `blacklist.txt`.
83+
- rename `build_env.sh.sample` into `build_env.sh` and configure content.
84+
- rename `config.py.sample` into `config.py` and configure content.
85+
- rename `blacklist.txt.sample` into `blacklist.txt`.
9286

9387
```
9488
python processing.py -i test_case/SRR1659960_05pc_* -o test_easyfuse_1.3.1/
95-
```
89+
```

config.py.sample

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import os
99
# 3) Which reference data shall be used (ref_trans_version & ref_genome_build)
1010
# 4) To whom shall slurm mails be sent to (receiver)
1111

12-
version = "1.3.2"
12+
__version__ = "1.3.3"
1313

1414
pipeline_name = "EasyFuse"
1515

@@ -162,4 +162,4 @@ other_files = {
162162
"soapfuse_cfg": "/path/to/soapfuse_config/config_h<release>.txt",
163163
"soapfuse_cfg_mm10": "/path/to/soapfuse_config/config_m<release>.txt",
164164
"easyfuse_model": os.path.join(module_dir, "data", "model", "Fusion_modeling_FFPE_deploy_v01.model_full_data.EasyFuse_model.rds")
165-
}
165+
}

misc/queueing.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ def get_jobs_by_name(name, system="slurm"):
2323
return get_jobs_by_name_slurm(name)
2424
elif system == "pbs":
2525
return get_jobs_by_name_pbs(name)
26+
else:
27+
return []
2628

2729
def get_jobs_by_name_pbs(name):
2830
jobs = []
@@ -70,12 +72,13 @@ def submit(job_name, cmd, cores, mem_usage, output_results_folder, dependencies,
7072
elif sched == "pbs":
7173
_submit_pbs(job_name, cmd, cores, mem_usage, output_results_folder, dependencies, module_file)
7274
else:
73-
_submit_nonqueue(cmd, module_file)
75+
_submit_nonqueue(job_name, cmd, module_file)
7476

75-
def _submit_nonqueue(cmd, module_file=""):
77+
def _submit_nonqueue(job_name, cmd, module_file=""):
7678
# if module_file:
7779
# cmd = " && ".join(["source " + module_file, " ".join(cmd)]).split(" ")
78-
# print(cmd)
80+
print("Running {}".format(job_name))
81+
print("CMD: {}".format(cmd))
7982
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
8083
(stdoutdata, stderrdata) = p.communicate()
8184
print(stdoutdata)

processing.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def run(self, tool_num_cutoff):
7777
# urla - note: would be happy to get the dependencies with a stacked LC, but is atm to complicated for me ^^
7878
dependency = []
7979
for sample in sample_list:
80-
dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample)))
80+
dependency.extend(Queueing.get_jobs_by_name("Fetchdata-{}".format(sample), cfg.queueing_system))
8181
modelling_string = ""
8282
if cfg.other_files["easyfuse_model"]:
8383
modelling_string = " --model_predictions"
@@ -109,6 +109,7 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_
109109
# kallisto_index_path = indices["kallisto"]
110110
# pizzly_cache_path = "{}.pizzlyCache.txt".format(genes_gtf_path)
111111
starfusion_index_path = indices["starfusion"]
112+
fusioncatcher_index_path = indices["fusioncatcher"]
112113
infusion_cfg_path = other_files["infusion_cfg"]
113114
# starchip_param_path = other_files["starchip_param"]
114115

@@ -133,6 +134,9 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_
133134
infusion_path = os.path.join(fusion_path, "infusion")
134135
soapfuse_path = os.path.join(fusion_path, "soapfuse")
135136
fetchdata_path = os.path.join(self.working_dir, "Sample_{}".format(sample_id), "fetchdata")
137+
fastqc_1 = os.path.join(qc_path, sample_id + "_R1_fastqc", "fastqc_data.txt")
138+
fastqc_2 = os.path.join(qc_path, sample_id + "_R2_fastqc", "fastqc_data.txt")
139+
136140

137141
for folder in [
138142
output_results_path,
@@ -163,9 +167,9 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_
163167
# Define cmd strings for each program
164168
# urla: mapsplice requires gunzip'd read files and process substitutions don't seem to work in slurm scripts...
165169
# process substitution do somehow not work from this script - c/p the command line to the terminal, however, works w/o issues?!
166-
cmd_fastqc = "{} --nogroup --extract -t 6 -o {} {} {}".format(cmds["fastqc"], qc_path, fq1, fq2)
167-
cmd_qc_parser = "{} -i {}/*/fastqc_data.txt -o {}".format(os.path.join(module_dir, "misc", "qc_parser.py"), qc_path, qc_table_path)
168-
cmd_skewer = "{} -q {} -i {} {} -o {}".format(os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path)
170+
cmd_fastqc = "{0} --nogroup --extract -t 6 -o {1} {2} {3}".format(cmds["fastqc"], qc_path, fq1, fq2)
171+
cmd_qc_parser = "{0} -i {1} {2} -o {3}".format(os.path.join(module_dir, "misc", "qc_parser.py"), fastqc_1, fastqc_2, qc_table_path)
172+
cmd_skewer = "{0} -q {1} -i {2} {3} -o {4}".format(os.path.join(module_dir, "tool_wrapper", "skewer_wrapper.py"), qc_table_path, fq1, fq2, skewer_path)
169173

170174
fq0 = ""
171175
if "QC" in tools:
@@ -192,12 +196,12 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_
192196
cmd_star = "{0} --genomeDir {1} --outFileNamePrefix waiting_for_output_string --runThreadN waiting_for_cpu_number --runMode alignReads --readFilesIn {2} {3} --readFilesCommand zcat --chimSegmentMin 10 --chimJunctionOverhangMin 10 --alignSJDBoverhangMin 10 --alignMatesGapMax {4} --alignIntronMax {4} --chimSegmentReadGapMax 3 --alignSJstitchMismatchNmax 5 -1 5 5 --seedSearchStartLmax 20 --winAnchorMultimapNmax 50 --outSAMtype BAM SortedByCoordinate --chimOutType Junctions SeparateSAMold --chimOutJunctionFormat 1".format(cmds["star"], star_index_path, fq1, fq2, cfg.max_dist_proper_pair)
193197
# (3) Mapslice
194198
# urla: the "keep" parameter requires gunzip >= 1.6
195-
cmd_extr_fastq1 = "gunzip {0} --keep".format(fq1)
196-
cmd_extr_fastq2 = "gunzip {0} --keep".format(fq2)
199+
cmd_extr_fastq1 = "gunzip --keep {0}".format(fq1)
200+
cmd_extr_fastq2 = "gunzip --keep {0}".format(fq2)
197201
# Added python interpreter to circumvent external hardcoded shell script
198202
cmd_mapsplice = "python {0} --chromosome-dir {1} -x {2} -1 {3} -2 {4} --threads waiting_for_cpu_number --output {5} --qual-scale phred33 --bam --seglen 20 --min-map-len 40 --gene-gtf {6} --fusion".format(cmds["mapsplice"], genome_chrs_path, bowtie_index_path, fq1[:-3], fq2[:-3], mapsplice_path, genes_gtf_path)
199203
# (4) Fusiocatcher
200-
cmd_fusioncatcher = "{0} --input {1} --output {2} -p waiting_for_cpu_number".format(cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_path)
204+
cmd_fusioncatcher = "{0} --input {1} --data {2} --output {3} -p waiting_for_cpu_number".format(cmds["fusioncatcher"], ",".join([fq1, fq2]), fusioncatcher_index_path, fusioncatcher_path)
201205
# star-fusion and star-chip can be run upon a previous star run (this MUST NOT be the star_filter run, but the star_expression run)
202206
# (5)
203207
cmd_starfusion = "{0} --chimeric_junction {1} --genome_lib_dir {2} --CPU waiting_for_cpu_number --output_dir {3}".format(cmds["starfusion"], "{}_Chimeric.out.junction".format(os.path.join(star_path, sample_id)), starfusion_index_path, starfusion_path)
@@ -317,36 +321,37 @@ def execute_pipeline(self, fq1, fq2, sample_id, ref_genome, ref_trans, tool_num_
317321
exe_cmds[i] = exe_cmds[i].replace("waiting_for_output_string", exe_path[i]).replace("waiting_for_cpu_number", str(cpu))
318322
cmd = " && ".join([exe_cmds[i], cmd_samples + tool])
319323
# Managing slurm dependencies
324+
que_sys = cfg.queueing_system
320325
if tool == "Pizzly":
321-
dependency = Queueing.get_jobs_by_name("Kallisto-{0}".format(sample_id))
326+
dependency = Queueing.get_jobs_by_name("Kallisto-{0}".format(sample_id), que_sys)
322327
elif tool == "Starfusion" or tool == "Starchip":
323-
dependency = Queueing.get_jobs_by_name("Star-{0}".format(sample_id))
328+
dependency = Queueing.get_jobs_by_name("Star-{0}".format(sample_id), que_sys)
324329
elif tool == "Fetchdata":
325-
dependency = Queueing.get_jobs_by_name(sample_id)
330+
dependency = Queueing.get_jobs_by_name(sample_id, que_sys)
326331
elif tool == "Assembly":
327-
dependency = Queueing.get_jobs_by_name("Fetchdata-{0}".format(sample_id))
332+
dependency = Queueing.get_jobs_by_name("Fetchdata-{0}".format(sample_id), que_sys)
328333
elif tool == "ReadFilter":
329-
dependency = Queueing.get_jobs_by_name("QC-{0}".format(sample_id))
330-
# else:
331-
dependency.extend(Queueing.get_jobs_by_name("Readfilter-{0}".format(sample_id)))
332-
dependency.extend(Queueing.get_jobs_by_name("QC-{0}".format(sample_id)))
334+
dependency = Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys)
335+
dependency.extend(Queueing.get_jobs_by_name("Readfilter-{0}".format(sample_id), que_sys))
336+
dependency.extend(Queueing.get_jobs_by_name("QC-{0}".format(sample_id), que_sys))
333337
self.logger.debug("Submitting slurm job: CMD - {0}; PATH - {1}; DEPS - {2}".format(cmd, exe_path[i], dependency))
334338
self.submit_job(uid, cmd, cpu, mem, exe_path[i], dependency, "")
335339
else:
336340
self.logger.info("Skipping {0} as it is not selected for execution (Selected are: {1})".format(tool, tools))
337341

338342
def submit_job(self, uid, cmd, cores, mem_usage, output_results_folder, dependencies, mail):
339343
"""Submit job to slurm scheduling"""
340-
already_running = Queueing.get_jobs_by_name(uid)
344+
que_sys = cfg.queueing_system
345+
already_running = Queueing.get_jobs_by_name(uid, que_sys)
341346
if not already_running:
342347
# urla: for compatibility reasons (and to be independent of shell commands), concatenated commands are splitted again,
343348
# dependencies within the splitted groups updated and everything submitted sequentially to the queueing system
344349
module_file = os.path.join(cfg.module_dir, "build_env.sh")
345-
que_sys = cfg.queueing_system
350+
346351
for i, cmd_split in enumerate(cmd.split(" && ")):
347352
if not que_sys in ["slurm", "pbs"]:
348353
cmd_split = cmd_split.split(" ")
349-
dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1)))
354+
dependencies.extend(Queueing.get_jobs_by_name("{0}_CMD{1}".format(uid, i - 1), que_sys))
350355
Queueing.submit("{0}_CMD{1}".format(uid, i), cmd_split, cores, mem_usage, output_results_folder, dependencies, cfg.partition, cfg.user, cfg.time_limit, mail, module_file, que_sys)
351356
time.sleep(0.5)
352357
else:
@@ -365,7 +370,7 @@ def main():
365370

366371
# if version is request, print it and exit
367372
if args.version:
368-
print(cfg.version)
373+
print(cfg.__version__)
369374
sys.exit(0)
370375

371376
script_call = "python {} -i {} -o {}".format(os.path.realpath(__file__), " ".join([os.path.abspath(x) for x in args.input_paths]), os.path.abspath(args.output_folder))

summarize_data.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
import time
1212
import argparse
1313

14-
import pandas as pd
15-
import seaborn as sns
1614
from join_data import DataJoining
1715
from misc.samples import SamplesDB
1816
import misc.io_methods as IOMethods

0 commit comments

Comments
 (0)