labgem
diff --git a/‎create_completion_matrix.py‎
Lines changed: 211 additions & 0 deletions b/‎create_completion_matrix.py‎
Lines changed: 211 additions & 0 deletions
diff --git a/‎create_pf_file.py‎
Lines changed: 94 additions & 0 deletions b/‎create_pf_file.py‎
Lines changed: 94 additions & 0 deletions
diff --git a/‎create_pf_file.sh‎
Lines changed: 69 additions & 0 deletions b/‎create_pf_file.sh‎
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
+
+
+import os
+import sys
+import pythoncyc
+import argparse
+import subprocess
+from pythoncyc import PTools as PTools
+from pythoncyc.PTools import PToolsError as PToolsError
+from pythoncyc.PTools import PythonCycError as PythonCycError
+
+#Usage : python3 create_completion_matrix.py -l PGDB_LIST_FILE   (one PGDB name per line, e.g. META)
+
+
+def close_pgdb_wosaving(pgdb):
+    try:
+        r = PTools.sendQueryToPTools('(close-kb :kb (kb-of-organism "'+pgdb._orgid+'") :save-updates-p nil)')
+    except PToolsError as msg:
+        raise PythonCycError('Pathway Tools was unable to close KB of organism (orgid) {orgid}. More specifically:  {msg}'.format(orgid=pgdb._orgid, msg=msg))
+    return(r)
+
+
+def get_genes_of_reaction(rxn, pgdb):
+    genes = set()
+    for gene in pgdb.genes_of_reaction(rxn):
+        genes.add(gene)
+    for enz in pgdb.enzymes_of_reaction(rxn):
+        for gene in pgdb.genes_of_protein(enz):
+            genes.add(gene)
+    return(genes)
+
+
+def write_pathway(pgdb):
+    with open("metacyc_pathways.tsv", "w") as pathways:
+        with open("metacyc_reactions_by_pathway.tsv", "w") as reactions:
+            header1="Pathway_Id\tCommon_name\n"
+            pathways.write(header1)
+            header2="Pathway_Id\tReaction_Id\tSpontaneous\tOrphan\tOrphan_in_Metacyc\n"
+            reactions.write(header2)
+            for path in pgdb.all_pathways(selector='all', base=True):
+                to_write=path.split("|")[1]+"\t"+pgdb[path].common_name+"\n"
+                pathways.write(to_write)
+                for rxn in pgdb[path].reaction_list:
+                    orphan_in_metacyc = "FALSE"
+                    orphan = "NA"
+                    spontaneous = "FALSE"
+                    if pgdb[rxn].spontaneous_p != None and pgdb[rxn].spontaneous_p == True:
+                        spontaneous = "TRUE"
+                    if len(get_genes_of_reaction(rxn, pgdb)) == 0 and spontaneous == "FALSE":
+                        orphan_in_metacyc = "TRUE"
+                    if pgdb[rxn].orphan_p != None:
+                        if pgdb[rxn].orphan_p[0] == "|NO|":
+                            orphan = "FALSE"
+                        else:
+                            orphan = "TRUE"
+                    to_write_r = path.split("|")[1]+"\t"+rxn.split("|")[1]+"\t"+spontaneous+"\t"+orphan+"\t"+orphan_in_metacyc+"\n"
+                    reactions.write(to_write_r)
+
+def get_pathways_none_spontaneous_reactions(pgdb):
+    pathways = dict()
+    for path in pgdb.all_pathways(selector='all', base=True):
+        pathways[path] = dict()
+        pathways[path]['Name'] = pgdb[path].common_name
+        pathways[path]['Reactions'] = set()
+        for rxn in pgdb[path].reaction_list:
+            if pgdb[rxn].spontaneous_p != None:
+                 if pgdb[rxn].spontaneous_p != True:
+                     pathways[path]['Reactions'].add(rxn)
+            else:
+                pathways[path]['Reactions'].add(rxn)
+    return(pathways)
+  
+def get_pathways_none_spontaneous_orphan_reactions(pgdb):
+    pathways = dict()
+    for path in pgdb.all_pathways(selector='all', base=True):
+        pathways[path] = dict()
+        pathways[path]['Name'] = pgdb[path].common_name
+        pathways[path]['Reactions'] = set()
+        for rxn in pgdb[path].reaction_list:
+            is_spontaneous = False
+            is_orphan = None
+            is_orphan_in_metacyc = False
+            
+            if pgdb[rxn].spontaneous_p != None and pgdb[rxn].spontaneous_p == True:
+                is_spontaneous = True
+                
+            if is_spontaneous == False:
+                if len(get_genes_of_reaction(rxn, pgdb)) == 0:
+                    is_orphan_in_metacyc = True
+                
+                if pgdb[rxn].orphan_p != None and pgdb[rxn].orphan_p[0] == "|NO|":
+                    is_orphan = False
+                else:
+                    is_orphan = True
+                    
+            if not(is_orphan_in_metacyc and (is_orphan == None or is_orphan)) and not is_spontaneous:
+                pathways[path]['Reactions'].add(rxn)
+                
+    return(pathways)                  
+
+def get_reactions_with_genes(pgdb):
+    reactions = set()
+    for rxn in pgdb.all_rxns(type_of_reactions = ':all'):
+        if len(get_genes_of_reaction(rxn, pgdb)) != 0:
+            reactions.add(rxn)
+    return(reactions)
+
+
+def get_pathways(pgdb):
+    pathways = set()
+    for path in pgdb.all_pathways(selector='all', base=True):
+        pathways.add(path)
+    return(pathways)
+    
+
+def write_pgdb_pathway_completion(pgdb, pgdb_name, use_orphan, completion_dict, position):
+    print(completion_dict)
+    if use_orphan:
+        meta_pathways = get_pathways_none_spontaneous_reactions(pgdb)
+        file_name = pgdb_name+"_pathway_completion.tsv"
+    else:
+        meta_pathways = get_pathways_none_spontaneous_orphan_reactions(pgdb)
+        file_name = pgdb_name+"_pathway_completion_wo_orphan.tsv"
+        
+    pgdb_reactions = get_reactions_with_genes(pgdb)
+    pgdb_pathways = get_pathways(pgdb)
+  
+    with open(file_name, "w") as pgdb_write:
+        header="PGDB\tPathway\tPathway_name\tIs_predicted\tCompletion\n"
+        pgdb_write.write(header)
+        for path in meta_pathways:
+            path_predicted = path in pgdb_pathways
+            if len(meta_pathways[path]['Reactions']) == 0:
+                completion = 0.0
+            else:
+                completion = len(meta_pathways[path]['Reactions'].intersection(pgdb_reactions))/len(meta_pathways[path]['Reactions'])
+            to_write=pgdb_name+"\t"+path.split("|")[1]+"\t"+meta_pathways[path]['Name']+"\t"+str(path_predicted)+"\t"+str(completion)+"\n"
+            pgdb_write.write(to_write)
+            completion_dict[pgdb[path].common_name][position] = (str(completion))
+        return(completion_dict)
+ 
+ 
+ 
+def write_completion_matrix(completion_dict, file_name, header):
+    with open(file_name, "w") as matrix_file:
+        matrix_file.write(header)
+        for pathway in completion_dict:
+            values_to_write = "\t".join(completion_dict[pathway])
+            to_write = pathway+"\t"+values_to_write+"\n"
+            matrix_file.write(to_write)
+        
+  
+def init_completion_dict(pgdb_list, completion_dict, completion_dict_wo_orphan):
+    for p in pgdb_list:
+        name = p.strip()
+        pgdb_name = '|'+name+'|'
+        pgdb = pythoncyc.select_organism(pgdb_name)
+        pgdb_pathways = get_pathways(pgdb)
+        for path in pgdb_pathways:
+            path_name = pgdb[path].common_name
+            if path_name not in completion_dict.keys():
+                completion_dict[path_name] = ["0.0"] * len(pgdb_list) 
+                completion_dict_wo_orphan[path_name] = ["0.0"] * len(pgdb_list)
+    return(completion_dict, completion_dict_wo_orphan)
+  
+                    
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument("-p", help="Enter the name of the PGDB (example : META)", required=False, type=str)
+    parser.add_argument("-l", help="Enter file with list of PGDB ", required=False, type=str)
+    parser.add_argument("-m", help="Argument for missing value : O or NA - 0 by default", required=False, type=str)
+    args = parser.parse_args()
+    ## write completion in separate files
+    header = "#NAMES"
+    completion_dict = dict()
+    completion_dict_wo_orphan = dict()
+    with open(args.l) as pgdb_file:
+        pgdb_list = [line.strip() for line in pgdb_file]
+    (completion_dict, completion_dict_wo_orphan) = init_completion_dict(pgdb_list, completion_dict, completion_dict_wo_orphan)
+    if args.l:
+        for (name, position) in zip(pgdb_list, range(len(pgdb_list))):
+            name = name.strip()
+            print(name)
+            header = header +"\t"+name
+            print(header)
+            pgdb_name = '|'+name+'|'
+            print(pgdb_name)
+            pgdb = pythoncyc.select_organism(pgdb_name)
+            print(pgdb)
+            #write_reactions_kegg_cross_ref(pgdb)
+            write_pathway(pgdb)
+            completion_dict = write_pgdb_pathway_completion(pgdb, name,  True, completion_dict, position)
+            completion_dict_wo_orphan = write_pgdb_pathway_completion(pgdb, name , False, completion_dict_wo_orphan, position)
+            #print(pgdb._orgid)
+            #close_pgdb_wosaving(pgdb)   
+    elif args.p:
+        pgdb_name = '|'+args.p+'|'
+        pgdb = pythoncyc.select_organism(pgdb_name)
+        write_pathway(pgdb)
+        write_pgdb_pathway_completion(pgdb, meta, args.p, True)
+        write_pgdb_pathway_completion(pgdb, meta, args.p, False)
+    else:
+        sys.exit("Please select an option!")
+    header = header+"\n"
+    write_completion_matrix(completion_dict, "completion_matrix.txt", header)
+    write_completion_matrix(completion_dict_wo_orphan, "completion_matrix_wo_orphan.tsv", header)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
+
+import argparse
+
+
+#usage : python3 create_pf_file.py -m MR_file -k KO_file -o output name and path -f input file name and path
+
+def create_pf_entry(ID, ko_dict, mr_dict, kofam_dict, product_dict):
+    to_add_EC = set()
+    to_add_metacyc = set()
+    to_add_function = set()
+    for i in kofam_dict[ID]:
+        to_add_EC.add("EC    "+i[1])
+        if i[0] in mr_dict.keys():
+            to_add_metacyc.add("METACYC\t"+ ko_dict[mr_dict[i[0]]])
+    if len(product_dict[ID]) == 1 :
+        to_add_function.add("FUNCTION\t"+product_dict[ID][0])
+    else:
+        for p in product_dict[ID]:
+            to_add_function.add("FUNCTION\t"+p)
+    to_write=('ID\t{ID}\n' + 
+              'NAME\t{ID}\n' +
+              'STARTBASE\t1\n' +
+              'ENDBASE\t99\n' +
+              'PRODUCT-TYPE\tP\n' +
+              '{METACYC}\n' +
+              '{FUNCTION}\n'+
+              '{EC}\n' +
+              '//\n').format(ID=ID, EC="\n".join(to_add_EC), METACYC="\n".join(to_add_metacyc), FUNCTION="\n".join(to_add_function) )
+    return(to_write)
+
+def create_KO_MR_dict(KO_file, MR_file):
+    ko_dict= dict()
+    mr_dict = dict()
+    with open(KO_file) as ko_file, open(MR_file) as mr_file:
+        for line in ko_file:
+            l = line.split()
+            ko_dict[l[0]] = l[1].strip()
+        for line in mr_file:
+            l = line.split()
+            mr_dict[l[0]] = l[1].strip()
+    return(ko_dict, mr_dict)
+
+
+def parse_kofam_file(kofamout, ko_dict, output, mr_dict):
+    kofam_dict = dict()
+    product_dict = dict()
+    with open(kofamout) as kofam_file:
+        with open(output, "w") as output_file:
+            for line in kofam_file:
+                if line.startswith("*"):
+                    l = line.split("\t")
+                    if l[2] in mr_dict.keys() and mr_dict[l[2]] in ko_dict.keys():
+                        if l[1] not in kofam_dict.keys():
+                            kofam_dict[l[1]] = list()
+                        if l[1] not in product_dict.keys():
+                            product_dict[l[1]] = list()
+                        if "[" in line:
+                            EC = l[-1].split(":")[-1].split("]")[0]
+                            kofam_dict[l[1]].append((l[2], EC))
+                            product = l[-1].split("[EC")[0].split('"')[1]
+                            product_dict[l[1]].append(product)
+                        else:
+                            kofam_dict[l[1]].append((l[2]))
+                            product = l[-1].split('"')[1]
+                            product_dict[l[1]].append(product)
+            for ID in kofam_dict:
+                entry = create_pf_entry(ID, ko_dict, mr_dict, kofam_dict, product_dict)
+                output_file.write(entry)            
+    
+                        
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "-m", help="Path and name of MR_file)",
+        required=True, type=str)
+    parser.add_argument(
+        "-k", help="Path and name of KO_file",
+        required=True, type=str)
+    parser.add_argument(
+    "-o", help="Path and name for output",
+    required=True, type=str)
+    parser.add_argument(
+    "-f", help="Path and name of kofamout file",
+    required=True, type=str)
+    args = parser.parse_args()
+
+    (ko_dict, mr_dict) = create_KO_MR_dict(args.m, args.k)
+    parse_kofam_file(args.f, ko_dict, args.o, mr_dict)
+
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1,69 @@
+#!/bin/bash
+#SBATCH --job-name=pf_file
+#SBATCH --output pf_file%A_%a.out
+#SBATCH --mem-per-cpu=10240
+#SBATCH --time=03:00:00
+#SBATCH -p normal
+
+
+######################################
+## create pf file for ptools analysis on inti
+## allow multiple file creation in parallel with the array option of sbatch on slurm
+##
+## command line example
+##sbatch --array=0-10%10 create_pf_file.sh -m $MR_file -k $KO_file -o $OUTFOLDER
+######################################
+
+
+#ARGUMENTS
+# -m  MR_file
+# -k  KO_file
+# -o  Path of output data folder (kofam results are read from $OUTFOLDER/KOFAM)
+#######################################
+
+##########Initialize variables to default values############
+
+
+# Print the command-line help for this script.
+UsageInfo () {
+    # Single quotes keep $MR_file, $KO_file and $OUTFOLDER as literal
+    # placeholders instead of expanding them to their current values.
+    echo 'usage : sbatch --array=0-10%10 create_pf_file.sh -m $MR_file -k $KO_file -o $OUTFOLDER'
+    echo "-o : path of output folder"
+    echo "-k : KO_file"
+    echo "-m : MR_file"
+    echo "-h : print this help"
+}
+
+##################### options #############
+
+# Leading ':' selects silent error reporting; -h takes no argument,
+# -o/-k/-m each take one.
+options=':ho:k:m:'
+
+while getopts "$options" option; do
+  case "$option" in
+    h) UsageInfo; exit 0;;
+    o) OUTFOLDER=${OPTARG};;
+    k) KO_file=${OPTARG};;
+    m) MR_file=${OPTARG};;
+    :) printf "missing argument for -%s\n" "$OPTARG" >&2; UsageInfo >&2; exit 1;;
+    \?) printf "illegal option: -%s\n" "$OPTARG" >&2; UsageInfo >&2; exit 1;;
+  esac
+done
+
+# The rest of the script needs all three values; stop early with the help
+# text rather than running with empty paths.
+if [ -z "${OUTFOLDER}" ] || [ -z "${KO_file}" ] || [ -z "${MR_file}" ]; then
+    echo "options -o, -k and -m are all required" >&2
+    UsageInfo >&2
+    exit 1
+fi
+
+
+
+
+##########  Script  ################
+
+INPUTS=($OUTFOLDER/KOFAM/*_kofamout_e001.txt)
+echo ${INPUTS}
+
+echo ${OUTFOLDER}
+echo ${KO_file}
+echo ${MR_file}
+i=${INPUTS[$SLURM_ARRAY_TASK_ID]}
+IDname=$(basename $i _kofamout_e001.txt)
+
+echo ${IDname}
+
+#run python script
+echo "python3 create_pf_file.py -m ${MR_file} -k ${KO_file} -o ${OUTFOLDER}/inputs/${IDname}/${IDname}.pf -f ${INPUTS[$SLURM_ARRAY_TASK_ID]}"
+python3 create_pf_file.py -m ${MR_file} -k ${KO_file} -o ${OUTFOLDER}/inputs/${IDname}/${IDname}.pf -f ${INPUTS[$SLURM_ARRAY_TASK_ID]}
+