11#!/usr/bin/env python
2+ import json
23import os
34import re
45import sys
56import tarfile
7+ from pathlib import Path
8+
69import ete3
710import glob
811import time
1922# Info #
2023__author__ = 'Dayana E. Salas-Leiva'
212422- __version__ = '1.2.5 '
25+ __version__ = '1.2.4 '
2326# End Info #
2427
2528# database info
3437 "5" : ["Eukfinder Env Dbs" , "72 GB" , "https://perun.biochem.dal.ca/Eukfinder/compressed_db/eukfinder_dbs_env_v1.2.5.tar.gz" , "eukfinder_dbs_env_v1.2.5.tar.gz" ]
3538}
3639
# Default locations (relative to the downloaded database directory) of the
# Centrifuge index, the PLAST database, and its taxonomy map.
_cdb = "Centrifuge_DB/Centrifuge_NewDB_Sept2020"
_pdb = "PLAST_DB/PlastDB_Jun2020.fasta"
_pmap = "PLAST_DB/PlastDB_Jun2020_map.txt"

# JSON config file holding the user's default database paths.
# os.path.join is portable; the result matches the previous
# f"{expanduser('~')}/.eukfinder/config.json" on POSIX systems.
_json_path = os.path.join(os.path.expanduser("~"), ".eukfinder", "config.json")
3746
3847# --- preparation ---
3948def trimming (bn , reads1 , reads2 , adapath , wsize , qscore , headcrop ,
@@ -2417,20 +2426,15 @@ def perform_long_seqs(user_args):
24172426 print (ms , sep = ' ' , end = '\n ' , file = sys .stdout , flush = True )
24182427 return 'Done'
24192428
2420-
2421- # NOTE: It is perhaps worth while to verify checksum in the future
2429+ # NOTE: It is perhaps worthwhile to verify checksum in the future
24222430def perform_download_db (user_args ):
2431+ path = user_args ['path' ]
24232432 name = user_args ['name' ]
24242433
2425- if user_args ['path' ] == "." :
2426- path = os .getcwd ()
2427- else :
2428- path = user_args ['path' ]
2429-
24302434 try :
24312435 os .mkdir (f"{ path } /{ name } " )
2432- except FileExistsError :
2433- sys .exit (f"{ name } already exists in { path } , choose a different name or filesystem path! " )
2436+ except FileNotFoundError : # TODO: should create directory instead
2437+ sys .exit (f"{ path } does not exist,check file integrity! \n Exiting... " )
24342438
24352439 print (f"Created { path } /{ name } \n " )
24362440
@@ -2455,11 +2459,20 @@ def perform_download_db(user_args):
24552459 file .close ()
24562460 os .remove (f"{ path } /{ name } /{ content [3 ]} " )
24572461
2462+ print ("\n Updating default database paths in json file..." )
2463+ new_json_data = {
2464+ "centrifuge_db" : f"{ path } /{ name } /{ _cdb } " ,
2465+ "plast_db" : f"{ path } /{ name } /{ _pdb } " ,
2466+ "plast_map" : f"{ path } /{ name } /{ _pmap } "
2467+ }
2468+ update_json (new_json_data )
2469+
24582470 os .remove (f"{ path } /{ name } /{ _all_db [3 ]} " )
24592471 sys .exit (f"\n Databases downloaded and decompressed in { path } /{ name } , exiting..." )
24602472 elif user_input == "no" :
24612473 while True :
24622474 print ("\n Please select database(s) which you would like to install, separated by spaces (e.g., 1 2).\n " )
2475+ # TODO: this shouldn't be hardcoded
24632476 print (f"1. { _database ['1' ][0 ]} - { _database ['1' ][1 ]} " )
24642477 print (f"2. { _database ['2' ][0 ]} - { _database ['2' ][1 ]} " )
24652478 print (f"3. { _database ['3' ][0 ]} - { _database ['3' ][1 ]} " )
@@ -2550,35 +2563,101 @@ def download_db(args):
25502563 path = args .path
25512564 return path
25522565
2553- def parse_arguments ():
def summary_table():
    """Summarize Eukfinder classification results.

    Changes into 'Eukfinder_results', runs `seqkit fx2tab` over every
    FASTA/FASTQ file there to produce per-file length tables, tallies the
    number of sequences and total length (bp) per classification group,
    and writes 'summary_table.txt' into that directory.

    Side effects: changes the process working directory, creates and
    removes temporary `.table` files, exits the process if the results
    directory is missing.
    """
    # Change directory to 'Eukfinder_results'
    try:
        os.chdir("Eukfinder_results")
    except FileNotFoundError:
        print("Error: 'Eukfinder_results' directory not found. Exiting...")
        sys.exit(1)

    # Generate tables from FASTA/FASTQ files.
    for f in glob.glob("*.f*"):
        basename = os.path.splitext(f)[0]
        output = f"{basename}.table"

        # Pass argv as a list (shell=False): immune to word-splitting and
        # shell-metacharacter injection in file names, unlike the previous
        # f-string command run through the shell.
        cmd = ["seqkit", "fx2tab", "--length", "--name", "--header-line",
               f, "-o", output]
        _ = run(cmd, stdout=PIPE, stderr=PIPE)

    # Accumulated per-group results.
    summary = {
        "Group": [],
        "#Seq": [],
        "Total size(bp)": []
    }

    # Process table files and collect data.
    for file in glob.glob("*.table"):
        # Group name precedes the extension; unpaired-read tables carry an
        # extra '.un' component before '.table'.
        if file.endswith(".un.table"):
            group = file.split(".")[-3]
        else:
            group = file.split(".")[-2]

        # Read the table, skipping the seqkit header line.
        try:
            data = pd.read_csv(file, sep='\t', header=0)
        except pd.errors.EmptyDataError:
            print(f"Warning: {file} is empty or malformed. Skipping...")
            continue

        summary["Group"].append(group)
        summary["#Seq"].append(len(data))
        summary["Total size(bp)"].append(data['length'].sum())

        # Delete the temporary `.table` file once tallied.
        os.remove(file)

    # Create and save the summary table.
    summary_df = pd.DataFrame(summary)
    output_file = "summary_table.txt"
    summary_df.to_csv(output_file, sep='\t', index=False)

    print(f"Summary table has been created: Eukfinder_results/{output_file}")
2619+
def update_json(new_json_data):
    """Overwrite the Eukfinder JSON config with *new_json_data*.

    Parameters
    ----------
    new_json_data : dict
        Mapping of config keys ("centrifuge_db", "plast_db", "plast_map")
        to filesystem path strings.
    """
    # Ensure ~/.eukfinder exists: main() creates it on startup, but other
    # callers (e.g. perform_download_db) should not rely on that and the
    # open(..., "w") below would otherwise raise FileNotFoundError.
    os.makedirs(os.path.dirname(_json_path), exist_ok=True)

    with open(_json_path, "w") as json_file:
        json.dump(new_json_data, json_file, indent=4)
2624+
def read_json():
    """Load the Eukfinder JSON config file and return its contents.

    Returns
    -------
    dict
        The parsed configuration (database path settings).
    """
    with open(_json_path, "r") as fh:
        return json.load(fh)
2631+
2632+ def parse_arguments (json_data ):
25542633
25552634 myargs = {
2556- '-n' : ['--number-of-threads' , str , 'Number of threads' , True ],
2557- '-z' : ['--number-of-chunks' , str , 'Number of chunks to split a '
2558- 'file' , True ],
2559- '-t' : ['--taxonomy-update' , str , 'Set to True the first '
2560- 'time the program is used. Otherwise set to False' , True ],
2561- '-p' : ['--plast-database' , str , 'path to plast database' , True ],
2562- '-m' : ['--plast-id-map' , str , 'path to taxonomy map for '
2563- 'plast database' , True ],
2564- '--cdb' : ['--centrifuge-database' , str , 'path to centrifuge '
2565- 'database' , True ],
2566- '-e' : ['--e-value' , float , 'threshold for plast searches' , True ],
2567- '--pid' : ['--percent_id' , float , 'percentage identity for '
2568- 'plast searches' , True ],
2569- '--cov' : ['--coverage' , float , 'percentage coverage for '
2570- 'plast searches' , True ],
2571- '--max_m' : ['--max_memory' , str , 'Maximum memory allocated to '
2572- 'carry out an assembly' , True ],
2573- '-k' : ['--kmers' , str , 'kmers to use during assembly. '
2635+ '-n' : ['--number-of-threads' , str , '20' , ' Number of threads' , False ],
2636+ '-z' : ['--number-of-chunks' , str , '2' , ' Number of chunks to split a '
2637+ 'file' , False ],
2638+ '-t' : ['--taxonomy-update' , str , 'False' , ' Set to True the first '
2639+ 'time the program is used. Otherwise set to False' , False ],
2640+ '-p' : ['--plast-database' , str , json_data [ "plast_db" ], 'path to plast database' , False ],
2641+ '-m' : ['--plast-id-map' , str , json_data [ "plast_map" ], 'path to taxonomy map for '
2642+ 'plast database' , False ],
2643+ '--cdb' : ['--centrifuge-database' , str , json_data [ "centrifuge_db" ], 'path to centrifuge '
2644+ 'database' , False ],
2645+ '-e' : ['--e-value' , float , 0.01 , 'threshold for plast searches' , False ],
2646+ '--pid' : ['--percent_id' , float , 60 , 'percentage identity for '
2647+ 'plast searches' , False ],
2648+ '--cov' : ['--coverage' , float , 10 , 'percentage coverage for '
2649+ 'plast searches' , False ],
2650+ '--max_m' : ['--max_memory' , str , "300" , 'Maximum memory allocated to '
2651+ 'carry out an assembly' , False ],
2652+ '-k' : ['--kmers' , str , "21, 33, 55" , 'kmers to use during assembly. '
25742653 'These must be odd and less than 128. default is 21,33,55' ,
25752654 False ],
2576- '--mhlen' : ['--min-hit-length' , int , 'Maximum memory allocated to '
2577- 'carry out an assembly' , True ],
2578- '--pclass' : ['--p-reads-class' , str , 'Classification for '
2579- 'pair end reads' , True ],
2580- '--uclass' : ['--u-reads-class' , str , 'Classification for '
2581- 'un-pair end reads' , True ]
2655+ '--mhlen' : ['--min-hit-length' , int , 25 , 'Maximum memory allocated to '
2656+ 'carry out an assembly' , False ],
2657+ '--pclass' : ['--p-reads-class' , str , None , 'Classification for '
2658+ 'pair end reads' , False ],
2659+ '--uclass' : ['--u-reads-class' , str , None , 'Classification for '
2660+ 'un-pair end reads' , False ]
25822661 }
25832662
25842663 parser = argparse .ArgumentParser (prog = 'eukfinder' )
@@ -2600,12 +2679,14 @@ def parse_arguments():
26002679 try :
26012680 group1 .add_argument (key , myargs [key ][0 ],
26022681 type = myargs [key ][1 ],
2603- help = myargs [key ][2 ],
2604- required = myargs [key ][3 ])
2682+ default = myargs [key ][2 ],
2683+ help = myargs [key ][3 ],
2684+ required = myargs [key ][4 ])
26052685 except :
26062686 parser_short_seqs .add_argument (key , myargs [key ][0 ],
26072687 type = myargs [key ][1 ],
2608- help = myargs [key ][2 ])
2688+ default = myargs [key ][2 ],
2689+ help = myargs [key ][3 ])
26092690
26102691 # --- second level parser for unpair mode --- #
26112692 # --- second level parser for read_prep --- #
@@ -2639,7 +2720,8 @@ def parse_arguments():
26392720 group2 .add_argument ('-o' , '--out_name' , type = str ,
26402721 help = 'output file basename' , required = True )
26412722 group2 .add_argument ('--cdb' , '--centrifuge-database' , type = str ,
2642- help = 'path to centrifuge database' , required = True )
2723+ default = json_data ["centrifuge_db" ],
2724+ help = 'path to centrifuge database' , required = False )
26432725 group2 .add_argument ('--qenc' , '--quality-encoding' , type = str ,
26442726 help = 'quality enconding for trimmomatic' , default = 'phred64' , required = False )
26452727
@@ -2655,23 +2737,24 @@ def parse_arguments():
26552737 group3 .add_argument ('--mhlen' , '--min-hit-length' , type = int ,
26562738 help = 'minimum hit length' , required = True )
26572739 group3 .add_argument ('--cdb' , '--centrifuge-database' , type = str ,
2658- help = 'path to centrifuge database' , required = True )
2740+ default = json_data ["centrifuge_db" ],
2741+ help = 'path to centrifuge database' , required = False )
26592742
26602743 myargs_lr = {
2661- '-n' : ['--number-of-threads' , str , 'Number of threads' , True ],
2662- '-z' : ['--number-of-chunks' , str , 'Number of chunks to split a'
2663- ' file' , True ],
2664- '-t' : ['--taxonomy-update' , str , 'Set to True the first '
2744+ '-n' : ['--number-of-threads' , str , '20' , ' Number of threads' , False ],
2745+ '-z' : ['--number-of-chunks' , str , '2' , ' Number of chunks to split a'
2746+ ' file' , False ],
2747+ '-t' : ['--taxonomy-update' , str , 'False' , ' Set to True the first '
26652748 'time the program is used. '
2666- 'Otherwise set to False' , True ],
2667- '-p' : ['--plast-database' , str , 'path to plast database' , True ],
2668- '-m' : ['--plast-id-map' , str , 'path to taxonomy map for '
2669- 'plast database' , True ],
2670- '-e' : ['--e-value' , float , 'threshold for plast searches' , True ],
2671- '--pid' : ['--percent_id' , float , 'percentage identity for '
2672- 'plast searches' , True ],
2673- '--cov' : ['--coverage' , float , 'percentage coverage for '
2674- 'plast searches' , True ],
2749+ 'Otherwise set to False' , False ],
2750+ '-p' : ['--plast-database' , str , json_data [ "plast_db" ], 'path to plast database' , False ],
2751+ '-m' : ['--plast-id-map' , str , json_data [ "plast_map" ], 'path to taxonomy map for '
2752+ 'plast database' , False ],
2753+ '-e' : ['--e-value' , float , 0.01 , 'threshold for plast searches' , False ],
2754+ '--pid' : ['--percent_id' , float , 60 , 'percentage identity for '
2755+ 'plast searches' , False ],
2756+ '--cov' : ['--coverage' , float , 10 , 'percentage coverage for '
2757+ 'plast searches' , False ],
26752758 }
26762759
26772760 # --- second level parser for read_prep_env --- #
@@ -2711,19 +2794,21 @@ def parse_arguments():
27112794 parser_download_db = subparsers .add_parser ("download_db" )
27122795 parser_download_db .add_argument ("-n" , "--name" , type = str , default = "eukfinder_databases" ,
27132796 help = "directory name for storing the databases" )
2714- parser_download_db .add_argument ("-p" , "--path" , type = str , default = ". " ,
2797+ parser_download_db .add_argument ("-p" , "--path" , type = str , default = "~/.eukfinder " ,
27152798 help = "filesystem path for storing the databases" )
27162799
27172800 for key in myargs_lr :
27182801 try :
27192802 group3 .add_argument (key , myargs_lr [key ][0 ],
27202803 type = myargs_lr [key ][1 ],
2721- help = myargs_lr [key ][2 ],
2722- required = myargs_lr [key ][3 ])
2804+ default = myargs [key ][2 ],
2805+ help = myargs_lr [key ][3 ],
2806+ required = myargs_lr [key ][4 ])
27232807 except :
27242808 parser_long_seqs .add_argument (key , myargs_lr [key ][0 ],
27252809 type = myargs_lr [key ][1 ],
2726- help = myargs_lr [key ][2 ])
2810+ default = myargs [key ][2 ],
2811+ help = myargs_lr [key ][3 ])
27272812
27282813 parser_short_seqs .set_defaults (func = short_seqs )
27292814 parser_long_seqs .set_defaults (func = long_seqs )
@@ -2733,63 +2818,20 @@ def parse_arguments():
27332818
27342819 return parser .parse_args ()
27352820
2736- def summary_table ():
2737- # Change directory to 'Eukfinder_results'
2738- try :
2739- os .chdir ("Eukfinder_results" )
2740- #print("Changed directory to 'Eukfinder_results'")
2741- except FileNotFoundError :
2742- print ("Error: 'Eukfinder_results' directory not found. Exiting..." )
2743- sys .exit (1 )
2744-
2745- # Generate tables from FASTA/FASTQ files
2746- for f in glob .glob ("*.f*" ):
2747- basename = os .path .splitext (f )[0 ] # Improved filename handling
2748- output = f"{ basename } .table"
2749-
2750- cmd = f"seqkit fx2tab --length --name --header-line { f } -o { output } "
2751- _ = run (cmd , stdout = PIPE , stderr = PIPE , shell = True )
2752-
2753- # Dictionary to store results
2754- summary = {
2755- "Group" : [],
2756- "#Seq" : [],
2757- "Total size(bp)" : []
2758- }
2759-
2760- # Process table files and collect data
2761- for file in glob .glob ("*.table" ):
2762- if file .endswith (".un.table" ):
2763- group = file .split ("." )[- 3 ]
2764- else :
2765- group = file .split ("." )[- 2 ]
2766-
2767- # Read data, skip header
2768- try :
2769- data = pd .read_csv (file , sep = '\t ' , header = 0 )
2770- except pd .errors .EmptyDataError :
2771- print (f"Warning: { file } is empty or malformed. Skipping..." )
2772- continue
2773-
2774- # Append results
2775- summary ["Group" ].append (group )
2776- summary ["#Seq" ].append (len (data ))
2777- summary ["Total size(bp)" ].append (data ['length' ].sum ())
2778-
2779- # Delete temporary `.table` files
2780- os .remove (file )
2781- #print(f"Deleted temporary file: {file}")
2782-
2783- # Create and save summary table
2784- summary_df = pd .DataFrame (summary )
2785- output_file = "summary_table.txt"
2786- summary_df .to_csv (output_file , sep = '\t ' , index = False )
2787-
2788- print (f"Summary table has been created: Eukfinder_results/{ output_file } " )
2789-
2790-
27912821def main ():
2792- args = parse_arguments ()
2822+ # json creation
2823+ if not os .path .exists (_json_path ):
2824+ os .makedirs (os .path .dirname (_json_path ), exist_ok = True )
2825+ update_json (
2826+ {
2827+ "centrifuge_db" : "" ,
2828+ "plast_db" : "" ,
2829+ "plast_map" : ""
2830+ }
2831+ )
2832+
2833+ json_data = read_json ()
2834+ args = parse_arguments (json_data )
27932835
27942836 if len (sys .argv ) == 1 :
27952837 print ('Try Eukfinder.py -h for more information' , sep = ' ' ,
0 commit comments