
Commit a3395d1

Merge pull request #31 from RogerLab/15-cmds
15 cmds
2 parents 6e1110a + f676b54 commit a3395d1

File tree

2 files changed: +155 additions, −197 deletions


bin/Eukfinder.py

Lines changed: 155 additions & 113 deletions
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
+import json
 import os
 import re
 import sys
 import tarfile
+from pathlib import Path
+
 import ete3
 import glob
 import time
@@ -19,7 +22,7 @@
 # Info #
 __author__ = 'Dayana E. Salas-Leiva'
 __email__ = '[email protected]'
-__version__ = '1.2.5'
+__version__ = '1.2.4'
 # End Info #

 # database info
@@ -34,6 +37,12 @@
     "5": ["Eukfinder Env Dbs", "72 GB", "https://perun.biochem.dal.ca/Eukfinder/compressed_db/eukfinder_dbs_env_v1.2.5.tar.gz", "eukfinder_dbs_env_v1.2.5.tar.gz"]
 }

+_cdb = "Centrifuge_DB/Centrifuge_NewDB_Sept2020"
+_pdb = "PLAST_DB/PlastDB_Jun2020.fasta"
+_pmap = "PLAST_DB/PlastDB_Jun2020_map.txt"
+
+# JSON
+_json_path = f"{os.path.expanduser('~')}/.eukfinder/config.json"

 # --- preparation ---
 def trimming(bn, reads1, reads2, adapath, wsize, qscore, headcrop,
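For orientation: the three new path constants are suffixes joined to whatever download directory the user picks at run time, and _json_path points at the per-user config file. A minimal sketch of the config that ends up on disk, assuming the download_db defaults (--path ~/.eukfinder, --name eukfinder_databases); the install location here is hypothetical:

import json
import os

# Hypothetical install location built from the download_db defaults
base = os.path.expanduser("~/.eukfinder/eukfinder_databases")

# Same keys that perform_download_db() later writes via update_json()
config = {
    "centrifuge_db": f"{base}/Centrifuge_DB/Centrifuge_NewDB_Sept2020",
    "plast_db": f"{base}/PLAST_DB/PlastDB_Jun2020.fasta",
    "plast_map": f"{base}/PLAST_DB/PlastDB_Jun2020_map.txt",
}
print(json.dumps(config, indent=4))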
@@ -2417,20 +2426,15 @@ def perform_long_seqs(user_args):
     print(ms, sep=' ', end='\n', file=sys.stdout, flush=True)
     return 'Done'

-
-# NOTE: It is perhaps worth while to verify checksum in the future
+# NOTE: It is perhaps worthwhile to verify checksum in the future
 def perform_download_db(user_args):
+    path = user_args['path']
     name = user_args['name']

-    if user_args['path'] == ".":
-        path = os.getcwd()
-    else:
-        path = user_args['path']
-
     try:
         os.mkdir(f"{path}/{name}")
-    except FileExistsError:
-        sys.exit(f"{name} already exists in {path}, choose a different name or filesystem path!")
+    except FileNotFoundError:  # TODO: should create directory instead
+        sys.exit(f"{path} does not exist,check file integrity!\nExiting...")

     print(f"Created {path}/{name}\n")

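The new except branch trades FileExistsError for FileNotFoundError, so a missing parent path now exits cleanly while a pre-existing directory raises an uncaught traceback; the inline TODO suggests creating the directory instead. A minimal sketch of that alternative (not what this commit does; ensure_db_dir is a hypothetical helper):

import os

def ensure_db_dir(path, name):
    # Create {path}/{name} with any missing parents instead of exiting;
    # exist_ok=True also avoids failing when the directory already exists.
    target = os.path.join(path, name)
    os.makedirs(target, exist_ok=True)
    return target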
@@ -2455,11 +2459,20 @@ def perform_download_db(user_args):
                 file.close()
                 os.remove(f"{path}/{name}/{content[3]}")

+            print("\nUpdating default database paths in json file...")
+            new_json_data = {
+                "centrifuge_db": f"{path}/{name}/{_cdb}",
+                "plast_db": f"{path}/{name}/{_pdb}",
+                "plast_map": f"{path}/{name}/{_pmap}"
+            }
+            update_json(new_json_data)
+
             os.remove(f"{path}/{name}/{_all_db[3]}")
             sys.exit(f"\nDatabases downloaded and decompressed in {path}/{name}, exiting...")
         elif user_input == "no":
             while True:
                 print("\nPlease select database(s) which you would like to install, separated by spaces (e.g., 1 2).\n")
+                # TODO: this shouldn't be hardcoded
                 print(f"1. {_database['1'][0]} - {_database['1'][1]}")
                 print(f"2. {_database['2'][0]} - {_database['2'][1]}")
                 print(f"3. {_database['3'][0]} - {_database['3'][1]}")
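After a successful download, update_json() persists the resolved database locations so later runs can pick them up as argparse defaults. A small round-trip sketch of the helper pair defined further down, pointed at a temporary file instead of the real ~/.eukfinder/config.json:

import json
import os
import tempfile

# Stand-in for _json_path so the sketch does not touch ~/.eukfinder
_json_path = os.path.join(tempfile.mkdtemp(), "config.json")

def update_json(new_json_data):
    with open(_json_path, "w") as json_file:
        json.dump(new_json_data, json_file, indent=4)

def read_json():
    with open(_json_path, "r") as json_file:
        return json.load(json_file)

update_json({"centrifuge_db": "/dbs/Centrifuge_DB/Centrifuge_NewDB_Sept2020",
             "plast_db": "/dbs/PLAST_DB/PlastDB_Jun2020.fasta",
             "plast_map": "/dbs/PLAST_DB/PlastDB_Jun2020_map.txt"})
print(read_json()["plast_db"])  # /dbs/PLAST_DB/PlastDB_Jun2020.fasta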
@@ -2550,35 +2563,101 @@ def download_db(args):
     path = args.path
     return path

-def parse_arguments():
+def summary_table():
+    # Change directory to 'Eukfinder_results'
+    try:
+        os.chdir("Eukfinder_results")
+        #print("Changed directory to 'Eukfinder_results'")
+    except FileNotFoundError:
+        print("Error: 'Eukfinder_results' directory not found. Exiting...")
+        sys.exit(1)
+
+    # Generate tables from FASTA/FASTQ files
+    for f in glob.glob("*.f*"):
+        basename = os.path.splitext(f)[0]  # Improved filename handling
+        output = f"{basename}.table"
+
+        cmd = f"seqkit fx2tab --length --name --header-line {f} -o {output}"
+        _ = run(cmd, stdout=PIPE, stderr=PIPE, shell=True)
+
+    # Dictionary to store results
+    summary = {
+        "Group": [],
+        "#Seq": [],
+        "Total size(bp)": []
+    }
+
+    # Process table files and collect data
+    for file in glob.glob("*.table"):
+        if file.endswith(".un.table"):
+            group = file.split(".")[-3]
+        else:
+            group = file.split(".")[-2]
+
+        # Read data, skip header
+        try:
+            data = pd.read_csv(file, sep='\t', header=0)
+        except pd.errors.EmptyDataError:
+            print(f"Warning: {file} is empty or malformed. Skipping...")
+            continue
+
+        # Append results
+        summary["Group"].append(group)
+        summary["#Seq"].append(len(data))
+        summary["Total size(bp)"].append(data['length'].sum())
+
+        # Delete temporary `.table` files
+        os.remove(file)
+        #print(f"Deleted temporary file: {file}")
+
+    # Create and save summary table
+    summary_df = pd.DataFrame(summary)
+    output_file = "summary_table.txt"
+    summary_df.to_csv(output_file, sep='\t', index=False)
+
+    print(f"Summary table has been created: Eukfinder_results/{output_file}")
+
+def update_json(new_json_data):
+
+    with open(_json_path, "w") as json_file:
+        json.dump(new_json_data, json_file, indent=4)
+
+def read_json():
+
+    with open(_json_path, "r") as json_file:
+        json_data = json.load(json_file)
+
+    return json_data
+
+def parse_arguments(json_data):

     myargs = {
-        '-n': ['--number-of-threads', str, 'Number of threads', True],
-        '-z': ['--number-of-chunks', str, 'Number of chunks to split a '
-               'file', True],
-        '-t': ['--taxonomy-update', str, 'Set to True the first '
-               'time the program is used. Otherwise set to False', True],
-        '-p': ['--plast-database', str, 'path to plast database', True],
-        '-m': ['--plast-id-map', str, 'path to taxonomy map for '
-               'plast database', True],
-        '--cdb': ['--centrifuge-database', str, 'path to centrifuge '
-                  'database', True],
-        '-e': ['--e-value', float, 'threshold for plast searches', True],
-        '--pid': ['--percent_id', float, 'percentage identity for '
-                  'plast searches', True],
-        '--cov': ['--coverage', float, 'percentage coverage for '
-                  'plast searches', True],
-        '--max_m': ['--max_memory', str, 'Maximum memory allocated to '
-                    'carry out an assembly', True],
-        '-k': ['--kmers', str, 'kmers to use during assembly. '
+        '-n': ['--number-of-threads', str, '20', 'Number of threads', False],
+        '-z': ['--number-of-chunks', str, '2', 'Number of chunks to split a '
+               'file', False],
+        '-t': ['--taxonomy-update', str, 'False', 'Set to True the first '
+               'time the program is used. Otherwise set to False', False],
+        '-p': ['--plast-database', str, json_data["plast_db"], 'path to plast database', False],
+        '-m': ['--plast-id-map', str, json_data["plast_map"], 'path to taxonomy map for '
+               'plast database', False],
+        '--cdb': ['--centrifuge-database', str, json_data["centrifuge_db"], 'path to centrifuge '
+                  'database', False],
+        '-e': ['--e-value', float, 0.01, 'threshold for plast searches', False],
+        '--pid': ['--percent_id', float, 60, 'percentage identity for '
+                  'plast searches', False],
+        '--cov': ['--coverage', float, 10, 'percentage coverage for '
+                  'plast searches', False],
+        '--max_m': ['--max_memory', str, "300", 'Maximum memory allocated to '
+                    'carry out an assembly', False],
+        '-k': ['--kmers', str, "21, 33, 55", 'kmers to use during assembly. '
               'These must be odd and less than 128. default is 21,33,55',
               False],
-        '--mhlen': ['--min-hit-length', int, 'Maximum memory allocated to '
-                    'carry out an assembly', True],
-        '--pclass': ['--p-reads-class', str, 'Classification for '
-                     'pair end reads', True],
-        '--uclass': ['--u-reads-class', str, 'Classification for '
-                     'un-pair end reads', True]
+        '--mhlen': ['--min-hit-length', int, 25, 'Maximum memory allocated to '
+                    'carry out an assembly', False],
+        '--pclass': ['--p-reads-class', str, None, 'Classification for '
+                     'pair end reads', False],
+        '--uclass': ['--u-reads-class', str, None, 'Classification for '
+                     'un-pair end reads', False]
     }

     parser = argparse.ArgumentParser(prog='eukfinder')
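summary_table() (relocated from the bottom of the file, otherwise unchanged) shells out to seqkit fx2tab for per-sequence lengths, then aggregates counts and total bp per group with pandas. A tiny sketch of just the aggregation step, assuming a pre-made table in the fx2tab layout so seqkit itself is not needed:

import io

import pandas as pd

# Stand-in for one seqkit fx2tab --length --name --header-line output
table = io.StringIO("#name\tlength\nseq1\t150\nseq2\t300\n")
data = pd.read_csv(table, sep='\t', header=0)

summary = {"Group": ["Euk"], "#Seq": [len(data)],
           "Total size(bp)": [int(data['length'].sum())]}
print(pd.DataFrame(summary))  # one row: Euk, 2, 450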
@@ -2600,12 +2679,14 @@ def parse_arguments():
         try:
             group1.add_argument(key, myargs[key][0],
                                 type=myargs[key][1],
-                                help=myargs[key][2],
-                                required=myargs[key][3])
+                                default=myargs[key][2],
+                                help=myargs[key][3],
+                                required=myargs[key][4])
         except:
             parser_short_seqs.add_argument(key, myargs[key][0],
                                            type=myargs[key][1],
-                                           help=myargs[key][2])
+                                           default=myargs[key][2],
+                                           help=myargs[key][3])

     # --- second level parser for unpair mode --- #
     # --- second level parser for read_prep --- #
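The loop above indexes the new five-field rows positionally: [long name, type, default, help, required]. A self-contained sketch of the same pattern with two flags lifted from myargs (the parser name is arbitrary):

import argparse

# [long name, type, default, help, required] -- the new myargs layout
myargs = {
    '-n': ['--number-of-threads', str, '20', 'Number of threads', False],
    '-e': ['--e-value', float, 0.01, 'threshold for plast searches', False],
}

parser = argparse.ArgumentParser(prog='eukfinder-sketch')
for key, (long_flag, typ, default, help_text, required) in myargs.items():
    parser.add_argument(key, long_flag, type=typ, default=default,
                        help=help_text, required=required)

args = parser.parse_args([])  # no CLI input, so the defaults apply
print(args.number_of_threads, args.e_value)  # 20 0.01

Making the options optional with stored defaults is what lets the JSON-backed database paths flow in without the user retyping them on every run.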
@@ -2639,7 +2720,8 @@ def parse_arguments():
     group2.add_argument('-o', '--out_name', type=str,
                         help='output file basename', required=True)
     group2.add_argument('--cdb', '--centrifuge-database', type=str,
-                        help='path to centrifuge database', required=True)
+                        default= json_data["centrifuge_db"],
+                        help='path to centrifuge database', required=False)
     group2.add_argument('--qenc', '--quality-encoding', type=str,
                         help='quality enconding for trimmomatic', default='phred64', required=False)

@@ -2655,23 +2737,24 @@ def parse_arguments():
     group3.add_argument('--mhlen', '--min-hit-length', type=int,
                         help='minimum hit length', required=True)
     group3.add_argument('--cdb', '--centrifuge-database', type=str,
-                        help='path to centrifuge database', required=True)
+                        default= json_data["centrifuge_db"],
+                        help='path to centrifuge database', required=False)

     myargs_lr = {
-        '-n': ['--number-of-threads', str, 'Number of threads', True],
-        '-z': ['--number-of-chunks', str, 'Number of chunks to split a'
-               ' file', True],
-        '-t': ['--taxonomy-update', str, 'Set to True the first '
+        '-n': ['--number-of-threads', str, '20', 'Number of threads', False],
+        '-z': ['--number-of-chunks', str, '2', 'Number of chunks to split a'
+               ' file', False],
+        '-t': ['--taxonomy-update', str, 'False', 'Set to True the first '
               'time the program is used. '
-              'Otherwise set to False', True],
-        '-p': ['--plast-database', str, 'path to plast database', True],
-        '-m': ['--plast-id-map', str, 'path to taxonomy map for '
-               'plast database', True],
-        '-e': ['--e-value', float, 'threshold for plast searches', True],
-        '--pid': ['--percent_id', float, 'percentage identity for '
-                  'plast searches', True],
-        '--cov': ['--coverage', float, 'percentage coverage for '
-                  'plast searches', True],
+              'Otherwise set to False', False],
+        '-p': ['--plast-database', str, json_data["plast_db"], 'path to plast database', False],
+        '-m': ['--plast-id-map', str, json_data["plast_map"], 'path to taxonomy map for '
+               'plast database', False],
+        '-e': ['--e-value', float, 0.01, 'threshold for plast searches', False],
+        '--pid': ['--percent_id', float, 60, 'percentage identity for '
+                  'plast searches', False],
+        '--cov': ['--coverage', float, 10, 'percentage coverage for '
+                  'plast searches', False],
     }

     # --- second level parser for read_prep_env --- #
@@ -2711,19 +2794,21 @@ def parse_arguments():
     parser_download_db = subparsers.add_parser("download_db")
     parser_download_db.add_argument("-n", "--name", type=str, default="eukfinder_databases",
                                     help="directory name for storing the databases")
-    parser_download_db.add_argument("-p", "--path", type=str, default=".",
+    parser_download_db.add_argument("-p", "--path", type=str, default="~/.eukfinder",
                                     help="filesystem path for storing the databases")

     for key in myargs_lr:
         try:
             group3.add_argument(key, myargs_lr[key][0],
                                 type=myargs_lr[key][1],
-                                help=myargs_lr[key][2],
-                                required=myargs_lr[key][3])
+                                default=myargs[key][2],
+                                help=myargs_lr[key][3],
+                                required=myargs_lr[key][4])
         except:
             parser_long_seqs.add_argument(key, myargs_lr[key][0],
                                           type=myargs_lr[key][1],
-                                          help=myargs_lr[key][2])
+                                          default=myargs[key][2],
+                                          help=myargs_lr[key][3])

     parser_short_seqs.set_defaults(func=short_seqs)
     parser_long_seqs.set_defaults(func=long_seqs)
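One hedged observation on the long-read loop: the added default= lines read myargs[key][2] while every other field comes from myargs_lr[key]; that works only because every myargs_lr key also exists in myargs with the same default. A sketch that sources the default from myargs_lr itself (behaviour-preserving under that assumption):

import argparse

# Five-field layout, as in myargs_lr
myargs_lr = {
    '-e': ['--e-value', float, 0.01, 'threshold for plast searches', False],
    '--pid': ['--percent_id', float, 60, 'percentage identity for '
              'plast searches', False],
}

parser = argparse.ArgumentParser(prog='eukfinder-lr-sketch')
group3 = parser.add_argument_group('long reads')
for key, (long_flag, typ, default, help_text, required) in myargs_lr.items():
    # default taken from the same table as the other fields
    group3.add_argument(key, long_flag, type=typ, default=default,
                        help=help_text, required=required)

print(parser.parse_args([]))  # Namespace(e_value=0.01, pid=60)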
@@ -2733,63 +2818,20 @@ def parse_arguments():
 
     return parser.parse_args()

-def summary_table():
-    # Change directory to 'Eukfinder_results'
-    try:
-        os.chdir("Eukfinder_results")
-        #print("Changed directory to 'Eukfinder_results'")
-    except FileNotFoundError:
-        print("Error: 'Eukfinder_results' directory not found. Exiting...")
-        sys.exit(1)
-
-    # Generate tables from FASTA/FASTQ files
-    for f in glob.glob("*.f*"):
-        basename = os.path.splitext(f)[0]  # Improved filename handling
-        output = f"{basename}.table"
-
-        cmd = f"seqkit fx2tab --length --name --header-line {f} -o {output}"
-        _ = run(cmd, stdout=PIPE, stderr=PIPE, shell=True)
-
-    # Dictionary to store results
-    summary = {
-        "Group": [],
-        "#Seq": [],
-        "Total size(bp)": []
-    }
-
-    # Process table files and collect data
-    for file in glob.glob("*.table"):
-        if file.endswith(".un.table"):
-            group = file.split(".")[-3]
-        else:
-            group = file.split(".")[-2]
-
-        # Read data, skip header
-        try:
-            data = pd.read_csv(file, sep='\t', header=0)
-        except pd.errors.EmptyDataError:
-            print(f"Warning: {file} is empty or malformed. Skipping...")
-            continue
-
-        # Append results
-        summary["Group"].append(group)
-        summary["#Seq"].append(len(data))
-        summary["Total size(bp)"].append(data['length'].sum())
-
-        # Delete temporary `.table` files
-        os.remove(file)
-        #print(f"Deleted temporary file: {file}")
-
-    # Create and save summary table
-    summary_df = pd.DataFrame(summary)
-    output_file = "summary_table.txt"
-    summary_df.to_csv(output_file, sep='\t', index=False)
-
-    print(f"Summary table has been created: Eukfinder_results/{output_file}")
-
-
 def main():
-    args = parse_arguments()
+    # json creation
+    if not os.path.exists(_json_path):
+        os.makedirs(os.path.dirname(_json_path), exist_ok=True)
+        update_json(
+            {
+                "centrifuge_db": "",
+                "plast_db": "",
+                "plast_map": ""
+            }
+        )
+
+    json_data = read_json()
+    args = parse_arguments(json_data)

     if len(sys.argv) == 1:
         print('Try Eukfinder.py -h for more information', sep=' ',
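To trace main()'s new first-run bootstrap: if ~/.eukfinder/config.json is absent, it is created with empty values, then read back and handed to parse_arguments(), so the stored database paths become argparse defaults. A compressed, self-contained sketch (stand-in path; note that until download_db populates the file, those defaults are empty strings):

import json
import os
import tempfile

# Stand-in for _json_path; the real code uses ~/.eukfinder/config.json
_json_path = os.path.join(tempfile.mkdtemp(), "config.json")

def bootstrap_config():
    # First run: write an empty config, mirroring main()
    if not os.path.exists(_json_path):
        os.makedirs(os.path.dirname(_json_path), exist_ok=True)
        with open(_json_path, "w") as fh:
            json.dump({"centrifuge_db": "", "plast_db": "",
                       "plast_map": ""}, fh, indent=4)
    # Every run: read it back for parse_arguments()
    with open(_json_path) as fh:
        return json.load(fh)

print(bootstrap_config())  # {'centrifuge_db': '', 'plast_db': '', 'plast_map': ''}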