11#!/usr/bin/env python
2+ import json
23import os
34import re
45import sys
56import tarfile
7+ from pathlib import Path
8+
69import ete3
710import glob
811import time
1922# Info #
2023__author__ = 'Dayana E. Salas-Leiva'
212422- __version__ = '1.2.5 '
25+ __version__ = '1.2.4 '
2326# End Info #
2427
2528# database info
3437 "5" : ["Eukfinder Env Dbs" , "72 GB" , "https://perun.biochem.dal.ca/Eukfinder/compressed_db/eukfinder_dbs_env_v1.2.5.tar.gz" , "eukfinder_dbs_env_v1.2.5.tar.gz" ]
3538}
3639
# Default locations (relative to the downloaded database directory) of the
# Centrifuge index, the PLAST database, and its taxonomy map.
_cdb = "Centrifuge_DB/Centrifuge_NewDB_Sept2020"
_pdb = "PLAST_DB/PlastDB_Jun2020.fasta"
_pmap = "PLAST_DB/PlastDB_Jun2020_map.txt"

# JSON config file holding the user's default database paths.
# os.path.join is portable; the result matches the previous
# f"{expanduser('~')}/.eukfinder/config.json" on POSIX systems.
_json_path = os.path.join(os.path.expanduser("~"), ".eukfinder", "config.json")
3746
3847# --- preparation ---
3948def trimming (bn , reads1 , reads2 , adapath , wsize , qscore , headcrop ,
@@ -2417,20 +2426,15 @@ def perform_long_seqs(user_args):
24172426 print (ms , sep = ' ' , end = '\n ' , file = sys .stdout , flush = True )
24182427 return 'Done'
24192428
2420-
2421- # NOTE: It is perhaps worth while to verify checksum in the future
2429+ # NOTE: It is perhaps worthwhile to verify checksum in the future
24222430def perform_download_db (user_args ):
2431+ path = user_args ['path' ]
24232432 name = user_args ['name' ]
24242433
2425- if user_args ['path' ] == "." :
2426- path = os .getcwd ()
2427- else :
2428- path = user_args ['path' ]
2429-
24302434 try :
24312435 os .mkdir (f"{ path } /{ name } " )
2432- except FileExistsError :
2433- sys .exit (f"{ name } already exists in { path } , choose a different name or filesystem path! " )
2436+ except FileNotFoundError : # TODO: should create directory instead
2437+ sys .exit (f"{ path } does not exist,check file integrity! \n Exiting... " )
24342438
24352439 print (f"Created { path } /{ name } \n " )
24362440
@@ -2455,11 +2459,20 @@ def perform_download_db(user_args):
24552459 file .close ()
24562460 os .remove (f"{ path } /{ name } /{ content [3 ]} " )
24572461
2462+ print ("\n Updating default database paths in json file..." )
2463+ new_json_data = {
2464+ "centrifuge_db" : f"{ path } /{ name } /{ _cdb } " ,
2465+ "plast_db" : f"{ path } /{ name } /{ _pdb } " ,
2466+ "plast_map" : f"{ path } /{ name } /{ _pmap } "
2467+ }
2468+ update_json (new_json_data )
2469+
24582470 os .remove (f"{ path } /{ name } /{ _all_db [3 ]} " )
24592471 sys .exit (f"\n Databases downloaded and decompressed in { path } /{ name } , exiting..." )
24602472 elif user_input == "no" :
24612473 while True :
24622474 print ("\n Please select database(s) which you would like to install, separated by spaces (e.g., 1 2).\n " )
2475+ # TODO: this shouldn't be hardcoded
24632476 print (f"1. { _database ['1' ][0 ]} - { _database ['1' ][1 ]} " )
24642477 print (f"2. { _database ['2' ][0 ]} - { _database ['2' ][1 ]} " )
24652478 print (f"3. { _database ['3' ][0 ]} - { _database ['3' ][1 ]} " )
@@ -2550,35 +2563,101 @@ def download_db(args):
25502563 path = args .path
25512564 return path
25522565
2553- def parse_arguments ():
def summary_table():
    """Summarize Eukfinder classification results.

    Changes into 'Eukfinder_results', runs `seqkit fx2tab` over every
    FASTA/FASTQ file there to produce per-file length tables, tallies the
    number of sequences and total length (bp) per classification group,
    and writes 'summary_table.txt' into that directory.

    Side effects: changes the process working directory, creates and
    removes temporary `.table` files, exits the process if the results
    directory is missing.
    """
    # Change directory to 'Eukfinder_results'
    try:
        os.chdir("Eukfinder_results")
    except FileNotFoundError:
        print("Error: 'Eukfinder_results' directory not found. Exiting...")
        sys.exit(1)

    # Generate tables from FASTA/FASTQ files.
    for f in glob.glob("*.f*"):
        basename = os.path.splitext(f)[0]
        output = f"{basename}.table"

        # Pass argv as a list (shell=False): immune to word-splitting and
        # shell-metacharacter injection in file names, unlike the previous
        # f-string command run through the shell.
        cmd = ["seqkit", "fx2tab", "--length", "--name", "--header-line",
               f, "-o", output]
        _ = run(cmd, stdout=PIPE, stderr=PIPE)

    # Accumulated per-group results.
    summary = {
        "Group": [],
        "#Seq": [],
        "Total size(bp)": []
    }

    # Process table files and collect data.
    for file in glob.glob("*.table"):
        # Group name precedes the extension; unpaired-read tables carry an
        # extra '.un' component before '.table'.
        if file.endswith(".un.table"):
            group = file.split(".")[-3]
        else:
            group = file.split(".")[-2]

        # Read the table, skipping the seqkit header line.
        try:
            data = pd.read_csv(file, sep='\t', header=0)
        except pd.errors.EmptyDataError:
            print(f"Warning: {file} is empty or malformed. Skipping...")
            continue

        summary["Group"].append(group)
        summary["#Seq"].append(len(data))
        summary["Total size(bp)"].append(data['length'].sum())

        # Delete the temporary `.table` file once tallied.
        os.remove(file)

    # Create and save the summary table.
    summary_df = pd.DataFrame(summary)
    output_file = "summary_table.txt"
    summary_df.to_csv(output_file, sep='\t', index=False)

    print(f"Summary table has been created: Eukfinder_results/{output_file}")
2619+
def update_json(new_json_data):
    """Overwrite the Eukfinder JSON config with *new_json_data*.

    Parameters
    ----------
    new_json_data : dict
        Mapping of config keys ("centrifuge_db", "plast_db", "plast_map")
        to filesystem path strings.
    """
    # Ensure ~/.eukfinder exists: main() creates it on startup, but other
    # callers (e.g. perform_download_db) should not rely on that and the
    # open(..., "w") below would otherwise raise FileNotFoundError.
    os.makedirs(os.path.dirname(_json_path), exist_ok=True)

    with open(_json_path, "w") as json_file:
        json.dump(new_json_data, json_file, indent=4)
2624+
def read_json():
    """Load the Eukfinder JSON config file and return its contents.

    Returns
    -------
    dict
        The parsed configuration (database path settings).
    """
    with open(_json_path, "r") as fh:
        return json.load(fh)
2631+
2632+ def parse_arguments (json_data ):
25542633
25552634 myargs = {
2556- '-n' : ['--number-of-threads' , str , 'Number of threads' , True ],
2557- '-z' : ['--number-of-chunks' , str , 'Number of chunks to split a '
2558- 'file' , True ],
2559- '-t' : ['--taxonomy-update' , str , 'Set to True the first '
2560- 'time the program is used. Otherwise set to False' , True ],
2561- '-p' : ['--plast-database' , str , 'path to plast database' , True ],
2562- '-m' : ['--plast-id-map' , str , 'path to taxonomy map for '
2563- 'plast database' , True ],
2564- '--cdb' : ['--centrifuge-database' , str , 'path to centrifuge '
2565- 'database' , True ],
2566- '-e' : ['--e-value' , float , 'threshold for plast searches' , True ],
2567- '--pid' : ['--percent_id' , float , 'percentage identity for '
2568- 'plast searches' , True ],
2569- '--cov' : ['--coverage' , float , 'percentage coverage for '
2570- 'plast searches' , True ],
2571- '--max_m' : ['--max_memory' , str , 'Maximum memory allocated to '
2572- 'carry out an assembly' , True ],
2573- '-k' : ['--kmers' , str , 'kmers to use during assembly. '
2635+ '-n' : ['--number-of-threads' , str , '20' , ' Number of threads' , False ],
2636+ '-z' : ['--number-of-chunks' , str , '2' , ' Number of chunks to split a '
2637+ 'file' , False ],
2638+ '-t' : ['--taxonomy-update' , str , 'False' , ' Set to True the first '
2639+ 'time the program is used. Otherwise set to False' , False ],
2640+ '-p' : ['--plast-database' , str , json_data [ "plast_db" ], 'path to plast database' , False ],
2641+ '-m' : ['--plast-id-map' , str , json_data [ "plast_map" ], 'path to taxonomy map for '
2642+ 'plast database' , False ],
2643+ '--cdb' : ['--centrifuge-database' , str , json_data [ "centrifuge_db" ], 'path to centrifuge '
2644+ 'database' , False ],
2645+ '-e' : ['--e-value' , float , 0.01 , 'threshold for plast searches' , False ],
2646+ '--pid' : ['--percent_id' , float , 60 , 'percentage identity for '
2647+ 'plast searches' , False ],
2648+ '--cov' : ['--coverage' , float , 10 , 'percentage coverage for '
2649+ 'plast searches' , False ],
2650+ '--max_m' : ['--max_memory' , str , "300" , 'Maximum memory allocated to '
2651+ 'carry out an assembly' , False ],
2652+ '-k' : ['--kmers' , str , "21, 33, 55" , 'kmers to use during assembly. '
25742653 'These must be odd and less than 128. default is 21,33,55' ,
25752654 False ],
2576- '--mhlen' : ['--min-hit-length' , int , 'Maximum memory allocated to '
2577- 'carry out an assembly' , True ],
2578- '--pclass' : ['--p-reads-class' , str , 'Classification for '
2579- 'pair end reads' , True ],
2580- '--uclass' : ['--u-reads-class' , str , 'Classification for '
2581- 'un-pair end reads' , True ]
2655+ '--mhlen' : ['--min-hit-length' , int , 25 , 'Maximum memory allocated to '
2656+ 'carry out an assembly' , False ],
2657+ '--pclass' : ['--p-reads-class' , str , None , 'Classification for '
2658+ 'pair end reads' , False ],
2659+ '--uclass' : ['--u-reads-class' , str , None , 'Classification for '
2660+ 'un-pair end reads' , False ]
25822661 }
25832662
25842663 parser = argparse .ArgumentParser (prog = 'eukfinder' )
@@ -2600,12 +2679,14 @@ def parse_arguments():
26002679 try :
26012680 group1 .add_argument (key , myargs [key ][0 ],
26022681 type = myargs [key ][1 ],
2603- help = myargs [key ][2 ],
2604- required = myargs [key ][3 ])
2682+ default = myargs [key ][2 ],
2683+ help = myargs [key ][3 ],
2684+ required = myargs [key ][4 ])
26052685 except :
26062686 parser_short_seqs .add_argument (key , myargs [key ][0 ],
26072687 type = myargs [key ][1 ],
2608- help = myargs [key ][2 ])
2688+ default = myargs [key ][2 ],
2689+ help = myargs [key ][3 ])
26092690
26102691 # --- second level parser for unpair mode --- #
26112692 # --- second level parser for read_prep --- #
@@ -2639,7 +2720,8 @@ def parse_arguments():
26392720 group2 .add_argument ('-o' , '--out_name' , type = str ,
26402721 help = 'output file basename' , required = True )
26412722 group2 .add_argument ('--cdb' , '--centrifuge-database' , type = str ,
2642- help = 'path to centrifuge database' , required = True )
2723+ default = json_data ["centrifuge_db" ],
2724+ help = 'path to centrifuge database' , required = False )
26432725 group2 .add_argument ('--qenc' , '--quality-encoding' , type = str ,
26442726 help = 'quality enconding for trimmomatic' , default = 'phred64' , required = False )
26452727
@@ -2655,23 +2737,24 @@ def parse_arguments():
26552737 group3 .add_argument ('--mhlen' , '--min-hit-length' , type = int ,
26562738 help = 'minimum hit length' , required = True )
26572739 group3 .add_argument ('--cdb' , '--centrifuge-database' , type = str ,
2658- help = 'path to centrifuge database' , required = True )
2740+ default = json_data ["centrifuge_db" ],
2741+ help = 'path to centrifuge database' , required = False )
26592742
26602743 myargs_lr = {
2661- '-n' : ['--number-of-threads' , str , 'Number of threads' , True ],
2662- '-z' : ['--number-of-chunks' , str , 'Number of chunks to split a'
2663- ' file' , True ],
2664- '-t' : ['--taxonomy-update' , str , 'Set to True the first '
2744+ '-n' : ['--number-of-threads' , str , '20' , ' Number of threads' , False ],
2745+ '-z' : ['--number-of-chunks' , str , '2' , ' Number of chunks to split a'
2746+ ' file' , False ],
2747+ '-t' : ['--taxonomy-update' , str , 'False' , ' Set to True the first '
26652748 'time the program is used. '
2666- 'Otherwise set to False' , True ],
2667- '-p' : ['--plast-database' , str , 'path to plast database' , True ],
2668- '-m' : ['--plast-id-map' , str , 'path to taxonomy map for '
2669- 'plast database' , True ],
2670- '-e' : ['--e-value' , float , 'threshold for plast searches' , True ],
2671- '--pid' : ['--percent_id' , float , 'percentage identity for '
2672- 'plast searches' , True ],
2673- '--cov' : ['--coverage' , float , 'percentage coverage for '
2674- 'plast searches' , True ],
2749+ 'Otherwise set to False' , False ],
2750+ '-p' : ['--plast-database' , str , json_data [ "plast_db" ], 'path to plast database' , False ],
2751+ '-m' : ['--plast-id-map' , str , json_data [ "plast_map" ], 'path to taxonomy map for '
2752+ 'plast database' , False ],
2753+ '-e' : ['--e-value' , float , 0.01 , 'threshold for plast searches' , False ],
2754+ '--pid' : ['--percent_id' , float , 60 , 'percentage identity for '
2755+ 'plast searches' , False ],
2756+ '--cov' : ['--coverage' , float , 10 , 'percentage coverage for '
2757+ 'plast searches' , False ],
26752758 }
26762759
26772760 # --- second level parser for read_prep_env --- #
@@ -2711,19 +2794,21 @@ def parse_arguments():
27112794 parser_download_db = subparsers .add_parser ("download_db" )
27122795 parser_download_db .add_argument ("-n" , "--name" , type = str , default = "eukfinder_databases" ,
27132796 help = "directory name for storing the databases" )
2714- parser_download_db .add_argument ("-p" , "--path" , type = str , default = ". " ,
2797+ parser_download_db .add_argument ("-p" , "--path" , type = str , default = "~/.eukfinder " ,
27152798 help = "filesystem path for storing the databases" )
27162799
27172800 for key in myargs_lr :
27182801 try :
27192802 group3 .add_argument (key , myargs_lr [key ][0 ],
27202803 type = myargs_lr [key ][1 ],
2721- help = myargs_lr [key ][2 ],
2722- required = myargs_lr [key ][3 ])
2804+ default = myargs [key ][2 ],
2805+ help = myargs_lr [key ][3 ],
2806+ required = myargs_lr [key ][4 ])
27232807 except :
27242808 parser_long_seqs .add_argument (key , myargs_lr [key ][0 ],
27252809 type = myargs_lr [key ][1 ],
2726- help = myargs_lr [key ][2 ])
2810+ default = myargs [key ][2 ],
2811+ help = myargs_lr [key ][3 ])
27272812
27282813 parser_short_seqs .set_defaults (func = short_seqs )
27292814 parser_long_seqs .set_defaults (func = long_seqs )
@@ -2733,63 +2818,20 @@ def parse_arguments():
27332818
27342819 return parser .parse_args ()
27352820
2736- def summary_table ():
2737- # Change directory to 'Eukfinder_results'
2738- try :
2739- os .chdir ("Eukfinder_results" )
2740- #print("Changed directory to 'Eukfinder_results'")
2741- except FileNotFoundError :
2742- print ("Error: 'Eukfinder_results' directory not found. Exiting..." )
2743- sys .exit (1 )
2744-
2745- # Generate tables from FASTA/FASTQ files
2746- for f in glob .glob ("*.f*" ):
2747- basename = os .path .splitext (f )[0 ] # Improved filename handling
2748- output = f"{ basename } .table"
2749-
2750- cmd = f"seqkit fx2tab --length --name --header-line { f } -o { output } "
2751- _ = run (cmd , stdout = PIPE , stderr = PIPE , shell = True )
2752-
2753- # Dictionary to store results
2754- summary = {
2755- "Group" : [],
2756- "#Seq" : [],
2757- "Total size(bp)" : []
2758- }
2759-
2760- # Process table files and collect data
2761- for file in glob .glob ("*.table" ):
2762- if file .endswith (".un.table" ):
2763- group = file .split ("." )[- 3 ]
2764- else :
2765- group = file .split ("." )[- 2 ]
2766-
2767- # Read data, skip header
2768- try :
2769- data = pd .read_csv (file , sep = '\t ' , header = 0 )
2770- except pd .errors .EmptyDataError :
2771- print (f"Warning: { file } is empty or malformed. Skipping..." )
2772- continue
2773-
2774- # Append results
2775- summary ["Group" ].append (group )
2776- summary ["#Seq" ].append (len (data ))
2777- summary ["Total size(bp)" ].append (data ['length' ].sum ())
2778-
2779- # Delete temporary `.table` files
2780- os .remove (file )
2781- #print(f"Deleted temporary file: {file}")
2782-
2783- # Create and save summary table
2784- summary_df = pd .DataFrame (summary )
2785- output_file = "summary_table.txt"
2786- summary_df .to_csv (output_file , sep = '\t ' , index = False )
2787-
2788- print (f"Summary table has been created: Eukfinder_results/{ output_file } " )
2789-
2790-
27912821def main ():
2792- args = parse_arguments ()
2822+ # json creation
2823+ if not os .path .exists (_json_path ):
2824+ os .makedirs (os .path .dirname (_json_path ), exist_ok = True )
2825+ update_json (
2826+ {
2827+ "centrifuge_db" : "" ,
2828+ "plast_db" : "" ,
2829+ "plast_map" : ""
2830+ }
2831+ )
2832+
2833+ json_data = read_json ()
2834+ args = parse_arguments (json_data )
27932835
27942836 if len (sys .argv ) == 1 :
27952837 print ('Try Eukfinder.py -h for more information' , sep = ' ' ,
0 commit comments