-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_reaction.py
236 lines (219 loc) · 12.2 KB
/
create_reaction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import collections
from Bio import SeqIO
import pickle
import cobra
import pickle
from modelAnalysis.info import aaDict, k_atp_protein
def get_tuple(line, sep="\t"):
return line.rstrip().split(sep)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tissue",
help="name of the tissue (in the protein atlas rna expression file)")
args = parser.parse_args()
rna_expression_file = "{}_rna_expression.tsv".format(args.tissue)
uniprot_ensembl_mapping_file = "uniprot_to_ensembl.dat"
human_proteome_swissprot_file = "UP000005640.xml"
human_proteome_trembl_file = "UP000005640_unreviewed.xml"
human_proteome_seq_dict = "human_proteome_sequences.pickle"
complete_uniprot_ensembl_file = "complete_uniprot_to_ensembl.pickle"
weighted_aminoacids_count_file = "weighted_aminoacid_count_{}.pickle".format(args.tissue)
met_dict_file = "recon2_met_dict.txt"
model_file = "recon2_fixed.xml"
protein_reaction_id = "biomass_protein"
protein_reaction_name = "protein component of biomass (based on RNA expression in {})".format(args.tissue)
new_biomass_protein_pickle = "biomass_protein_{}.pickle".format(args.tissue)
tissue_specific_model = "recon2_{}.xml".format(args.tissue)
# Store tissue rna expression in a dictionary mapping ensembl id to normalized
# expression level
with open(rna_expression_file, "r") as fh:
tuples = (get_tuple(line) for line in fh)
ensembl_expression = dict()
for ensembl_id, gene_name, expression_level in tuples:
ensembl_expression[ensembl_id] = float(expression_level)
# Read the human proteome file
# NOTE: at the time this script is written, the "swiss"-formatted file
# reading through SeqIO.parse is broken.
# The dictionary is pickled because it takes quite some time to generate
try:
with open(human_proteome_seq_dict, "rb") as fh:
sequences = pickle.load(fh)
except:
print("Generating the sequences dict")
sequences = dict() # uniprot id to aminoacid sequence
print("Reading Swissprot data")
with open(human_proteome_swissprot_file, "r") as fh:
human_proteome_swissprot = SeqIO.UniprotIO.UniprotIterator(fh)
for record in human_proteome_swissprot:
for accession_id in record.annotations["accessions"]:
sequences[accession_id] = record.seq
print("Reading TrEMBL data")
with open(human_proteome_trembl_file, "r") as fh:
human_proteome_trembl = SeqIO.UniprotIO.UniprotIterator(fh)
for record in human_proteome_trembl:
for accession_id in record.annotations["accessions"]:
if accession_id not in sequences:
sequences[accession_id] = record.seq
with open(human_proteome_seq_dict, "wb") as fh:
print("saving it to {}".format(human_proteome_seq_dict))
pickle.dump(sequences, fh)
print("done")
try:
with open(complete_uniprot_ensembl_file, "rb") as fh:
ensembl_to_uniprot = pickle.load(fh)
except:
print("Generating ensemble-to-uniprot id dictionary")
# Store ensembl id to uniprot ids (one ensembl id actually points to
# potentially multiple uniprot ids)
ensembl_to_uniprot = collections.defaultdict(list)
with open(uniprot_ensembl_mapping_file, "r") as fh:
tuples = (get_tuple(line) for line in fh)
for uniprot_id, ensembl_id in tuples:
ensembl_to_uniprot[ensembl_id].append(uniprot_id)
# Store supplementary ensembl_id -> uniprot id by looking at OpenTargets and
# Bgee crossrefs
print("Adding supplementary crossreferences")
print("Reading from Swissprot")
with open(human_proteome_swissprot_file, "r") as fh:
human_proteome = SeqIO.UniprotIO.UniprotIterator(fh)
for record in human_proteome:
opentargets_crossref = [crossref for crossref in record.dbxrefs
if crossref.startswith("OpenTargets:")]
bgee_crossref = [crossref for crossref in record.dbxrefs
if crossref.startswith("Bgee:")]
if opentargets_crossref:
dbname, ensembl_id = opentargets_crossref[0].split(":")
ensembl_to_uniprot[ensembl_id].extend(record.annotations["accessions"])
if bgee_crossref:
dbname, ensembl_id = bgee_crossref[0].split(":")
ensembl_to_uniprot[ensembl_id].extend(record.annotations["accessions"])
print("Reading from TrEMBL")
with open(human_proteome_trembl_file, "r") as fh:
human_proteome = SeqIO.UniprotIO.UniprotIterator(fh)
for record in human_proteome:
opentargets_crossref = [crossref for crossref in record.dbxrefs
if crossref.startswith("OpenTargets:")]
bgee_crossref = [crossref for crossref in record.dbxrefs
if crossref.startswith("Bgee:")]
if opentargets_crossref:
dbname, ensembl_id = opentargets_crossref[0].split(":")
ensembl_to_uniprot[ensembl_id].extend(record.annotations["accessions"])
if bgee_crossref:
dbname, ensembl_id = bgee_crossref[0].split(":")
ensembl_to_uniprot[ensembl_id].extend(record.annotations["accessions"])
with open(complete_uniprot_ensembl_file, "wb") as fh:
print("saving it to {}".format(complete_uniprot_ensembl_file))
pickle.dump(ensembl_to_uniprot, fh)
print("done")
# For every ensembl id in the tissue expression levels data,
# get it's protein sequence from uniprot
print("Compute sum of aminoacids in proteome weighted by RNA expression")
total_aminoacid_counter = collections.Counter()
peptide_bonds = 0 # length of proteins weighted their by RNA expression
nbof_rna = len(ensembl_expression)
for i, expression_data in enumerate(ensembl_expression.items()):
ensembl_id, expression = expression_data
uniprot_ids = ensembl_to_uniprot[ensembl_id]
# ensembl_id not associated to any uniprot id may exist in uniprot but are
# not included in the proteome data probably because they are evidenced
# at transcript level but not at protein level. Those putative proteins
# are skipped from the aminoacid count
if uniprot_ids:
print("{} -> {}: {}/{}".format(ensembl_id, "; ".join(uniprot_ids), i, nbof_rna))
valid_uniprot_id = next(uniprot_id for uniprot_id in uniprot_ids
if uniprot_id in sequences)
sequence = sequences[valid_uniprot_id]
aminoacids_count = collections.Counter(sequence)
weighted_aminoacid_count = {aa: count * expression for aa, count in aminoacids_count.items()}
total_aminoacid_counter += weighted_aminoacid_count
peptide_bonds += len(sequence) * expression
else:
print("{}: no protein information: skipped".format(ensembl_id))
print("done")
print("saving it to {}".format(weighted_aminoacids_count_file))
with open(weighted_aminoacids_count_file, "wb") as fh:
pickle.dump({"aminoacid count": total_aminoacid_counter,
"peptide bonds": peptide_bonds}, fh)
print("Generate the biomass reaction")
# Read the metabolites dictionary
with open(met_dict_file, "r") as fh:
met_dict = dict()
next(fh)
next(fh)
for line in fh:
key, ID, name, chebi_ID = line.split(",")
met_dict[key] = ID.lstrip()
# Normalize the aminoacid count.
# The reaction's stoichiometry is multiplied by a scaling factor such that the
# mass of aminoacid being consummed is the same as in the original reaction.
print("Reading the model")
model = cobra.io.read_sbml_model(model_file)
print("Normalizing the stoichiometry")
original_protein_reaction = model.reactions.get_by_id("biomass_protein")
# Create a reverse metabolite dictionary which maps the model's ids to their
# generic names (A, G, H, P...)
reverse_met_dict = {value: key for key, value in met_dict.items()}
# Compute the molecular weight of the aminoacids in the default protein biomass
# reaction in the model (molecular weight of individual aminoacids weighted by
# their stoichiometric coefficient)
# NOTE: the conditional of this list comprehension excludes reagents of the
# biomass reaction which are not aminoacids (biomass, atp, h2o...)
original_protein_reaction_weight = sum([-coef * aaDict[reverse_met_dict[met.id]]
for met, coef
in original_protein_reaction.metabolites.items()
if met.id in reverse_met_dict
and reverse_met_dict[met.id] in aaDict])
original_total_aa_stoich = sum(-coef for met, coef in original_protein_reaction.metabolites.items()
if met.id in reverse_met_dict
and reverse_met_dict[met.id] in aaDict)
original_atp_stoich = next(coef for met, coef in original_protein_reaction.metabolites.items() if met.id == met_dict["atp"])
print("cost per aminoacid in original reaction: {} atp/aa".format(original_atp_stoich / original_total_aa_stoich))
# Compute the molecular weight of the aminoacids in the total aminoacid count
# from RNA
# NOTE: "X" corresponds to unidentified aminoacids in uniprot.
# NOTE: "U" corresponds to pyrrolidone carboxylic acid, which does not exist in
# the recon2 model.
weight_from_count = sum(count * aaDict[aa_id]
for aa_id, count
in total_aminoacid_counter.items()
if aa_id != "X" and aa_id != "U")
scaling_factor = original_protein_reaction_weight / weight_from_count
reaction_stoichiometry = {aa: -count for aa, count in total_aminoacid_counter.items()}
# Add ATP, ADP and water to the stoichiometry
virus_protein_atp_cost = peptide_bonds * k_atp_protein
reaction_stoichiometry["atp"] = -virus_protein_atp_cost
reaction_stoichiometry["h2o"] = -virus_protein_atp_cost
reaction_stoichiometry["adp"] = virus_protein_atp_cost
reaction_stoichiometry["Pi"] = virus_protein_atp_cost
reaction_stoichiometry["h"] = virus_protein_atp_cost
normalized_count = {key: value * scaling_factor
for key, value in reaction_stoichiometry.items()
if key != "X" and key != "U"}
# DEBUG
print("new protein reaction:")
print({met: coef for met, coef in normalized_count.items()})
print("cost per aminoacid in new reaction:")
print(virus_protein_atp_cost / -sum(coef for met, coef in reaction_stoichiometry.items()
if met in aaDict))
# Turn the count dictionary into an input suitable for a
# cobra.core.reaction.Reaction instance.
print("Instanciating the reaction")
metabolites_dict = {model.metabolites.get_by_id(met_dict[key]): value
for key, value in normalized_count.items()}
protein_metabolite = model.metabolites.get_by_id("biomass_protein_c")
metabolites_dict[protein_metabolite] = 1
# Create the new biomass_protein function
biomass_protein = cobra.core.reaction.Reaction(id=protein_reaction_id,
name=protein_reaction_name)
biomass_protein.add_metabolites(metabolites_dict)
print(biomass_protein)
print("Saving the new biomass protein reaction as {}".format(new_biomass_protein_pickle))
with open(new_biomass_protein_pickle, "wb") as fh:
pickle.dump(biomass_protein, fh)
# Create the updated model
original_protein_reaction_index = next(i for i, reaction in enumerate(model.reactions)
if reaction.id == original_protein_reaction.id)
model.reactions[original_protein_reaction_index] = biomass_protein
print("Saving the new model as {}".format(tissue_specific_model))
cobra.io.write_sbml_model(model, tissue_specific_model)