Skip to content

Commit e1cffad

Browse files
authored
Merge pull request #1 from labgem/dev-1
merge dev-1 branch to main
2 parents c64da12 + 76058e4 commit e1cffad

8 files changed

+21694
-0
lines changed

create_completion_matrix.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
# -*- coding: utf-8 -*-
2+
#!/usr/bin/env/ python3
3+
4+
5+
import os
6+
import sys
7+
import pythoncyc
8+
import argparse
9+
import subprocess
10+
from pythoncyc import PTools as PTools
11+
from pythoncyc.PTools import PToolsError as PToolsError
12+
from pythoncyc.PTools import PythonCycError as PythonCycError
13+
14+
#Usage : python3 dev_pythoncyc.py -p META
15+
16+
17+
def close_pgdb_wosaving(pgdb):
18+
try:
19+
r = PTools.sendQueryToPTools('(close-kb :kb (kb-of-organism "'+pgdb._orgid+'") :save-updates-p nil)')
20+
except PToolsError as msg:
21+
raise PythonCycError('Pathway Tools was unable to close KB of organism (orgid) {orgid}. More specifically: {msg}'.format(orgid=pgdb._orgid, msg=msg))
22+
return(r)
23+
24+
25+
def get_genes_of_reaction(rxn, pgdb):
26+
genes = set()
27+
for gene in pgdb.genes_of_reaction(rxn):
28+
genes.add(gene)
29+
for enz in pgdb.enzymes_of_reaction(rxn):
30+
for gene in pgdb.genes_of_protein(enz):
31+
genes.add(gene)
32+
return(genes)
33+
34+
35+
def write_pathway(pgdb):
36+
with open("metacyc_pathways.tsv", "w") as pathways:
37+
with open("metacyc_reactions_by_pathway.tsv", "w") as reactions:
38+
header1="Pathway_Id\tCommon_name\n"
39+
pathways.write(header1)
40+
header2="Pathway_Id\tReaction_Id\tSpontaneous\tOrphan\tOrphan_in_Metacyc\n"
41+
reactions.write(header2)
42+
for path in pgdb.all_pathways(selector='all', base=True):
43+
to_write=path.split("|")[1]+"\t"+pgdb[path].common_name+"\n"
44+
pathways.write(to_write)
45+
for rxn in pgdb[path].reaction_list:
46+
orphan_in_metacyc = "FALSE"
47+
orphan = "NA"
48+
spontaneous = "FALSE"
49+
if pgdb[rxn].spontaneous_p != None and pgdb[rxn].spontaneous_p == True:
50+
spontaneous = "TRUE"
51+
if len(get_genes_of_reaction(rxn, pgdb)) == 0 and spontaneous == "FALSE":
52+
orphan_in_metacyc = "TRUE"
53+
if pgdb[rxn].orphan_p != None:
54+
if pgdb[rxn].orphan_p[0] == "|NO|":
55+
orphan = "FALSE"
56+
else:
57+
orphan = "TRUE"
58+
to_write_r = path.split("|")[1]+"\t"+rxn.split("|")[1]+"\t"+spontaneous+"\t"+orphan+"\t"+orphan_in_metacyc+"\n"
59+
reactions.write(to_write_r)
60+
61+
def get_pathways_none_spontaneous_reactions(pgdb):
62+
pathways = dict()
63+
for path in pgdb.all_pathways(selector='all', base=True):
64+
pathways[path] = dict()
65+
pathways[path]['Name'] = pgdb[path].common_name
66+
pathways[path]['Reactions'] = set()
67+
for rxn in pgdb[path].reaction_list:
68+
if pgdb[rxn].spontaneous_p != None:
69+
if pgdb[rxn].spontaneous_p != True:
70+
pathways[path]['Reactions'].add(rxn)
71+
else:
72+
pathways[path]['Reactions'].add(rxn)
73+
return(pathways)
74+
75+
def get_pathways_none_spontaneous_orphan_reactions(pgdb):
76+
pathways = dict()
77+
for path in pgdb.all_pathways(selector='all', base=True):
78+
pathways[path] = dict()
79+
pathways[path]['Name'] = pgdb[path].common_name
80+
pathways[path]['Reactions'] = set()
81+
for rxn in pgdb[path].reaction_list:
82+
is_spontaneous = False
83+
is_orphan = None
84+
is_orphan_in_metacyc = False
85+
86+
if pgdb[rxn].spontaneous_p != None and pgdb[rxn].spontaneous_p == True:
87+
is_spontaneous = True
88+
89+
if is_spontaneous == False:
90+
if len(get_genes_of_reaction(rxn, pgdb)) == 0:
91+
is_orphan_in_metacyc = True
92+
93+
if pgdb[rxn].orphan_p != None and pgdb[rxn].orphan_p[0] == "|NO|":
94+
is_orphan = False
95+
else:
96+
is_orphan = True
97+
98+
if not(is_orphan_in_metacyc and (is_orphan == None or is_orphan)) and not is_spontaneous:
99+
pathways[path]['Reactions'].add(rxn)
100+
101+
return(pathways)
102+
103+
def get_reactions_with_genes(pgdb):
104+
reactions = set()
105+
for rxn in pgdb.all_rxns(type_of_reactions = ':all'):
106+
if len(get_genes_of_reaction(rxn, pgdb)) != 0:
107+
reactions.add(rxn)
108+
return(reactions)
109+
110+
111+
def get_pathways(pgdb):
112+
pathways = set()
113+
for path in pgdb.all_pathways(selector='all', base=True):
114+
pathways.add(path)
115+
return(pathways)
116+
117+
118+
def write_pgdb_pathway_completion(pgdb, pgdb_name, use_orphan, completion_dict, position):
119+
print(completion_dict)
120+
if use_orphan:
121+
meta_pathways = get_pathways_none_spontaneous_reactions(pgdb)
122+
file_name = pgdb_name+"_pathway_completion.tsv"
123+
else:
124+
meta_pathways = get_pathways_none_spontaneous_orphan_reactions(pgdb)
125+
file_name = pgdb_name+"_pathway_completion_wo_orphan.tsv"
126+
127+
pgdb_reactions = get_reactions_with_genes(pgdb)
128+
pgdb_pathways = get_pathways(pgdb)
129+
130+
with open(file_name, "w") as pgdb_write:
131+
header="PGDB\tPathway\tPathway_name\tIs_predicted\tCompletion\n"
132+
pgdb_write.write(header)
133+
for path in meta_pathways:
134+
path_predicted = path in pgdb_pathways
135+
if len(meta_pathways[path]['Reactions']) == 0:
136+
completion = 0.0
137+
else:
138+
completion = len(meta_pathways[path]['Reactions'].intersection(pgdb_reactions))/len(meta_pathways[path]['Reactions'])
139+
to_write=pgdb_name+"\t"+path.split("|")[1]+"\t"+meta_pathways[path]['Name']+"\t"+str(path_predicted)+"\t"+str(completion)+"\n"
140+
pgdb_write.write(to_write)
141+
completion_dict[pgdb[path].common_name][position] = (str(completion))
142+
return(completion_dict)
143+
144+
145+
146+
def write_completion_matrix(completion_dict, file_name, header):
147+
with open(file_name, "w") as matrix_file:
148+
matrix_file.write(header)
149+
for pathway in completion_dict:
150+
values_to_write = "\t".join(completion_dict[pathway])
151+
to_write = pathway+"\t"+values_to_write+"\n"
152+
matrix_file.write(to_write)
153+
154+
155+
def init_completion_dict(pgdb_list, completion_dict, completion_dict_wo_orphan):
156+
for p in pgdb_list:
157+
name = p.strip()
158+
pgdb_name = '|'+name+'|'
159+
pgdb = pythoncyc.select_organism(pgdb_name)
160+
pgdb_pathways = get_pathways(pgdb)
161+
for path in pgdb_pathways:
162+
path_name = pgdb[path].common_name
163+
if path_name not in completion_dict.keys():
164+
completion_dict[path_name] = ["0.0"] * len(pgdb_list)
165+
completion_dict_wo_orphan[path_name] = ["0.0"] * len(pgdb_list)
166+
return(completion_dict, completion_dict_wo_orphan)
167+
168+
169+
def main():
170+
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
171+
parser.add_argument("-p", help="Enter the name of the PGDB (example : META)", required=False, type=str)
172+
parser.add_argument("-l", help="Enter file with list of PGDB ", required=False, type=str)
173+
parser.add_argument("-m", help="Argument for missing value : O or NA - 0 by default", required=False, type=str)
174+
args = parser.parse_args()
175+
## write completion in separate files
176+
header = "#NAMES"
177+
completion_dict = dict()
178+
completion_dict_wo_orphan = dict()
179+
with open(args.l) as pgdb_file:
180+
pgdb_list = [line.strip() for line in pgdb_file]
181+
(completion_dict, completion_dict_wo_orphan) = init_completion_dict(pgdb_list, completion_dict, completion_dict_wo_orphan)
182+
if args.l:
183+
for (name, position) in zip(pgdb_list, range(len(pgdb_list))):
184+
name = name.strip()
185+
print(name)
186+
header = header +"\t"+name
187+
print(header)
188+
pgdb_name = '|'+name+'|'
189+
print(pgdb_name)
190+
pgdb = pythoncyc.select_organism(pgdb_name)
191+
print(pgdb)
192+
#write_reactions_kegg_cross_ref(pgdb)
193+
write_pathway(pgdb)
194+
completion_dict = write_pgdb_pathway_completion(pgdb, name, True, completion_dict, position)
195+
completion_dict_wo_orphan = write_pgdb_pathway_completion(pgdb, name , False, completion_dict_wo_orphan, position)
196+
#print(pgdb._orgid)
197+
#close_pgdb_wosaving(pgdb)
198+
elif args.p:
199+
pgdb_name = '|'+args.p+'|'
200+
pgdb = pythoncyc.select_organism(pgdb_name)
201+
write_pathway(pgdb)
202+
write_pgdb_pathway_completion(pgdb, meta, args.p, True)
203+
write_pgdb_pathway_completion(pgdb, meta, args.p, False)
204+
else:
205+
sys.exit("Please select an option!")
206+
header = header+"\n"
207+
write_completion_matrix(completion_dict, "completion_matrix.txt", header)
208+
write_completion_matrix(completion_dict_wo_orphan, "completion_matrix_wo_orphan.tsv", header)
209+
210+
if __name__ == "__main__":
211+
main()

create_pf_file.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# -*- coding: utf-8 -*-
2+
#!/usr/bin/env/ python3
3+
4+
import argparse
5+
6+
7+
#usage : python3 create_pf_file.py -m MR_file -k KO_file -o output name and path -f input file name and path
8+
9+
def create_pf_entry(ID, ko_dict, mr_dict, kofam_dict, product_dict):
10+
to_add_EC = set()
11+
to_add_metacyc = set()
12+
to_add_function = set()
13+
for i in kofam_dict[ID]:
14+
to_add_EC.add("EC "+i[1])
15+
if i[0] in mr_dict.keys():
16+
to_add_metacyc.add("METACYC\t"+ ko_dict[mr_dict[i[0]]])
17+
if len(product_dict[ID]) == 1 :
18+
to_add_function.add("FUNCTION\t"+product_dict[ID][0])
19+
else:
20+
for p in product_dict[ID]:
21+
to_add_function.add("FUNCTION\t"+p)
22+
to_write=('ID\t{ID}\n' +
23+
'NAME\t{ID}\n' +
24+
'STARTBASE\t1\n' +
25+
'ENDBASE\t99\n' +
26+
'PRODUCT-TYPE\tP\n' +
27+
'{METACYC}\n' +
28+
'{FUNCTION}\n'+
29+
'{EC}\n' +
30+
'//\n').format(ID=ID, EC="\n".join(to_add_EC), METACYC="\n".join(to_add_metacyc), FUNCTION="\n".join(to_add_function) )
31+
return(to_write)
32+
33+
def create_KO_MR_dict(KO_file, MR_file):
34+
ko_dict= dict()
35+
mr_dict = dict()
36+
with open(KO_file) as ko_file, open(MR_file) as mr_file:
37+
for line in ko_file:
38+
l = line.split()
39+
ko_dict[l[0]] = l[1].strip()
40+
for line in mr_file:
41+
l = line.split()
42+
mr_dict[l[0]] = l[1].strip()
43+
return(ko_dict, mr_dict)
44+
45+
46+
def parse_kofam_file(kofamout, ko_dict, output, mr_dict):
47+
kofam_dict = dict()
48+
product_dict = dict()
49+
with open(kofamout) as kofam_file:
50+
with open(output, "w") as output_file:
51+
for line in kofam_file:
52+
if line.startswith("*"):
53+
l = line.split("\t")
54+
if l[2] in mr_dict.keys() and mr_dict[l[2]] in ko_dict.keys():
55+
if l[1] not in kofam_dict.keys():
56+
kofam_dict[l[1]] = list()
57+
if l[1] not in product_dict.keys():
58+
product_dict[l[1]] = list()
59+
if "[" in line:
60+
EC = l[-1].split(":")[-1].split("]")[0]
61+
kofam_dict[l[1]].append((l[2], EC))
62+
product = l[-1].split("[EC")[0].split('"')[1]
63+
product_dict[l[1]].append(product)
64+
else:
65+
kofam_dict[l[1]].append((l[2]))
66+
product = l[-1].split('"')[1]
67+
product_dict[l[1]].append(product)
68+
for ID in kofam_dict:
69+
entry = create_pf_entry(ID, ko_dict, mr_dict, kofam_dict, product_dict)
70+
output_file.write(entry)
71+
72+
73+
def main():
74+
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
75+
parser.add_argument(
76+
"-m", help="Path and name of MR_file)",
77+
required=True, type=str)
78+
parser.add_argument(
79+
"-k", help="Path and name of KO_file",
80+
required=True, type=str)
81+
parser.add_argument(
82+
"-o", help="Path and name for output",
83+
required=True, type=str)
84+
parser.add_argument(
85+
"-f", help="Path and name of kofamout file",
86+
required=True, type=str)
87+
args = parser.parse_args()
88+
89+
(ko_dict, mr_dict) = create_KO_MR_dict(args.m, args.k)
90+
parse_kofam_file(args.f, ko_dict, args.o, mr_dict)
91+
92+
if __name__ == "__main__":
93+
main()
94+

create_pf_file.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=pf_file
3+
#SBATCH --output pf_file%A_%a.out
4+
#SBATCH --mem-per-cpu=10240
5+
#SBATCH --time=03:00:00
6+
#SBATCH -p normal
7+
8+
9+
######################################
10+
## create pf file for ptools analysis on inti
11+
## allow multiple file creation in parrallel with the array option of sbatch on slurm
12+
##
13+
## commande line example
14+
##sbatch --array=0-10%10 create_pf_file.sh -m $MR_file -k $KO_file -p $INFOLDER -o $OUTFOLDER
15+
######################################
16+
17+
18+
#ARGUMENTS
19+
# 1. list of protein prediction using prodigal named *_prodigal_prot.faa
20+
# 2. Path of input data folder
21+
# 3. Path of output data folder
22+
#######################################
23+
24+
##########Initialize variables to default values############
25+
26+
27+
UsageInfo () {
28+
echo "usage : sbatch --array=0-10%10 create_pf_file.sh -m $MR_file -k $KO_file -p $INFOLDER -o $OUTFOLDER"
29+
echo "-o : path of output folder"
30+
echo "-k : KO_file"
31+
echo "-m : MR_file"
32+
echo "-h : print this help"
33+
}
34+
35+
##################### options #############
36+
37+
options=':h:o:k:m:'
38+
39+
while getopts $options option; do
40+
case "$option" in
41+
h) echo "$usage"; exit;;
42+
o) OUTFOLDER=${OPTARG};;
43+
k) KO_file=${OPTARG};;
44+
m) MR_file=${OPTARG};;
45+
:) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;;
46+
\?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;;
47+
esac
48+
done
49+
50+
51+
52+
53+
########## Script ################
54+
55+
INPUTS=($OUTFOLDER/KOFAM/*_kofamout_e001.txt)
56+
echo ${INPUTS}
57+
58+
echo ${OUTFOLDER}
59+
echo ${KO_file}
60+
echo ${MR_file}
61+
i=${INPUTS[$SLURM_ARRAY_TASK_ID]}
62+
IDname=$(basename $i _kofamout_e001.txt)
63+
64+
echo ${IDname}
65+
66+
#run python script
67+
echo "python3 create_pf_file.py -m ${MR_file} -k ${KO_file} -o ${OUTFOLDER}/inputs/${IDname}/${IDname}.pf -f ${INPUTS[$SLURM_ARRAY_TASK_ID]}"
68+
python3 create_pf_file.py -m ${MR_file} -k ${KO_file} -o ${OUTFOLDER}/inputs/${IDname}/${IDname}.pf -f ${INPUTS[$SLURM_ARRAY_TASK_ID]}
69+

0 commit comments

Comments
 (0)