-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsilva_mapping.py
286 lines (250 loc) · 9.18 KB
/
silva_mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
'''
This was written in an effort to make a TIPP reference out of the SILVA data, unfortunately
that ended in failure because mapping from SILVA to NCBI Taxonomy is not well-defined.
'''
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import re, json, sys, os
import datetime
from multiprocessing import Pool
import subprocess
from ncbi_querying import *
from phylogeny_utilities.utilities import *
file_lookup = get_dict_from_tab_delimited_file('/projects/tallis/nute/data/ncbi-2017/file-list-lookup.txt')
strRE = 'derived from\s+(?P<insdn>[A-Za-z0-9]+)\.'
pat = re.compile(strRE)
file_ids = get_list_from_file('/projects/tallis/nute/data/ncbi-2017/all_cog_uniq_file_ids.txt')
def get_file_CDS_to_INSDN_dict(id):
file_path = file_lookup[id]
si = SeqIO.parse(file_path,'genbank')
cds_to_insdn = {}
for sr in si:
res = pat.search(sr.annotations['comment'])
if res is not None:
acc = res.group('insdn')
else:
acc = None
for feat in sr.features:
if feat.type=='CDS' and 'protein_id' in feat.qualifiers.keys():
k = id + '_pt:' + feat.qualifiers['protein_id'][0]
cds_to_insdn[k] = acc
return cds_to_insdn
def make_cog_silva_taxonid_map(cog):
wd = '/projects/tallis/nute/data/ncbi-2017'
cog_id = cog.replace('.txt','')
silva_tax = os.path.join(wd,'silva_tax','tax_slv_ssu_128.acc_taxid_dedupe_tabd')
std = get_dict_from_tab_delimited_file(silva_tax)
cog_insdn = os.path.join(wd,'COG_insdn_maps',cog)
ci = get_dict_from_tab_delimited_file(cog_insdn)
outmap = open(os.path.join(wd,'COG_silva_maps',cog),'w')
for k in ci.keys():
taxon = None
try:
taxon = std[ci[k]]
except:
taxon = None
outmap.write('%s,%s\n' % (k,taxon))
outmap.close()
# def get_ncbi_assembly_id(aid):
# """
# Writen by Nute on 4/10/18.
# Takes NCBI Accession ID and returns an NCBI Assembly Accession if one exists
#
# Parameters
# -----------
# aid : string
# NCBI Accession ID
#
# Returns
# --------
# assemb_id : string
# NCBI Assembly Accession ID
# """
# p = subprocess.Popen(["curl", "-s",
# "https://www.ncbi.nlm.nih.gov/nuccore/" + aid],
# stdout=subprocess.PIPE)
# (content, error) = p.communicate()
# uidtext = "/assembly\?LinkName=nuccore_assembly&from_uid=(?P<uid>[0-9]+)"
# assembtext = "<p class=\"title\"><a href=\"/assembly/(?P<assid>[A-Za-z0-9\._]+)\""
# uidre = re.compile(uidtext)
# uidmatch = uidre.search(content)
# assembre = re.compile(assembtext)
# assembmatch = assembre.search(content)
# if assembmatch is not None and assembmatch('asssid'):
# return assembmatch.group('assid')
# elif uidmatch is not None and uidmatch.group('uid'):
# uid = uidmatch.group('uid')
# p = subprocess.Popen(["curl", "-s",
# "https://www.ncbi.nlm.nih.gov/assembly?LinkName=nuccore_assembly&from_uid=" + uid],
# stdout=subprocess.PIPE)
# (content2, error) = p.communicate()
# assembmatch2 = assembre.search(content2)
# if assembmatch2 is not None and assembmatch2.group('assid'):
# return assembmatch2.group('assid')
# return None
def get_file_CDS_to_ncbi_taxid_dict(id):
file_path = file_lookup[id]
pa, fi = os.path.split(file_path)
si = SeqIO.parse(file_path,'genbank')
cds_to_ncbi = {}
for sr in si:
acc = None
if sr.features[0].type != 'source':
print ("%s - record: %s does not have a source" % (fi, sr.id))
continue
elif 'db_xref' not in sr.features[0].qualifiers.keys():
print( "%s - record: %s does not have field called db_xref" % (fi, sr.id))
continue
else:
for dx in sr.features[0].qualifiers['db_xref']:
if dx[0:5]=='taxon':
acc = dx.replace('taxon:','')
continue
if acc is None:
print ("%s - record: %s does not have a db_xref entry for taxon" % (fi, sr.id))
continue
for feat in sr.features:
if feat.type=='CDS' and 'protein_id' in feat.qualifiers.keys():
k = id + '_pt:' + feat.qualifiers['protein_id'][0]
cds_to_ncbi[k] = (acc,sr.name)
return cds_to_ncbi
def get_all_files_in_list(file_list):
my_ids = get_list_from_file(file_list + '.txt')
all_dict = {}
# ct = 0
for i in my_ids:
# ct +=1
# all_dict[i]=get_file_CDS_to_INSDN_dict(i)
all_dict[i] = get_file_CDS_to_ncbi_taxid_dict(i)
# if ct % 25 == 0:
# print "%s done" % ct
# outf = open(file_list + '.json','w')
outf = open(file_list + '_ncbi.json', 'w')
json.dump(all_dict,outf)
outf.close()
print ('%s done' % file_list)
# def get_full_cds_dict():
# str_name = '/projects/tallis/nute/data/ncbi-2017/partial_cog_uniq/all_cog_uniq_file_ids_'
#
# full = {}
#
# for i in range(320):
# print str(i)
# # myf = open(str_name + str(i) + '.json', 'r')
# myf = open(str_name + str(i) + '_ncbi.json','r')
# a = json.load(myf)
# myf.close()
# full.update(a)
# del a
#
# # js = open('/projects/tallis/nute/data/ncbi-2017/full_cds_dict.json','w')
# js = open('/projects/tallis/nute/data/ncbi-2017/full_cds_dict_ncbi.json', 'w')
# json.dump(full,js)
# js.close()
# return full
def get_full_cds_dict():
# pa = '/projects/tallis/nute/data/ncbi-2017/full_cds_dict.json'
pa = '/projects/tallis/nute/data/ncbi-2017/full_cds_dict_ncbi.json'
myf = open(pa,'r')
full = json.load(myf)
myf.close()
# print "loaded full_cds_dict.json"
print ("loaded full_cds_dict_ncbi.json")
return full
def make_cog_insdn_map(cog_file, insdn_lookup):
fas = read_from_fasta(cog_file)
out_file = cog_file.replace('.faa','.txt')
out_file = out_file.replace('COGS','COG_insdn_maps')
of = open(out_file, 'w')
for i in fas.keys():
a = i.strip().split('_')
fi_ind = a[0]
try:
insdn = insdn_lookup[fi_ind][i]
except:
insdn = '(Not Found)'
of.write('%s\t%s\n' % (i,insdn))
of.close()
def make_cog_ncbi_map(cog_file, ncbi_lookup):
fas = read_from_fasta(cog_file)
out_file = cog_file.replace('.faa','.txt')
out_file = out_file.replace('COGS','COG_ncbi_maps')
of = open(out_file, 'w')
for i in fas.keys():
a = i.strip().split('_')
fi_ind = a[0]
try:
ncbi_acc = ncbi_lookup[fi_ind][i][1]
taxon = ncbi_lookup[fi_ind][i][0]
except:
ncbi_acc = '(Not Found)'
taxon = '(Not Found)'
# of.write('%s,%s,%s\n' % (i,ncbi_acc,taxon))
of.write('%s\t%s\n' % (i,taxon))
of.close()
def make_file_seq_accn_taxid_lookup():
'''
Added by Nute on 4/10/2018 to figure out which accessions map to which taxa
:return:
'''
ncbi_lkp = get_full_cds_dict()
outf = open('/projects/tallis/nute/data/ncbi-2017/file_seq_accn_taxid.txt','w')
outf.write('file_id\tseqname\tncbi_tax_id\tncbi_accn_id\n')
fct = 0
lct = 0
for fid in ncbi_lkp.keys():
for seqid in ncbi_lkp[fid].keys():
tax_accn = ncbi_lkp[fid][seqid]
outf.write('%s\t%s\t%s\t%s\n' % (fid, seqid, tax_accn[0], tax_accn[1]))
lct +=1
fct +=1
if fct % 1000 == 0:
print (fct)
outf.close()
print ('wrote %s lines' % lct)
def make_accn_taxid_lookup():
'''
Added by Nute on 4/10/2018 to figure out which accessions map to which taxa
:return:
'''
ncbi_lkp = get_full_cds_dict()
outf = open('/projects/tallis/nute/data/ncbi-2017/accn_taxid_lookup_2.txt','w')
outf.write('ncbi_accession_id\tnum_taxon_ids\tncbi_taxon_id\n')
accns = {}
fct = 0
for fid in ncbi_lkp.keys():
for seqid in ncbi_lkp[fid].keys():
tax_accn = ncbi_lkp[fid][seqid]
if tax_accn[1] not in accns.keys():
accns[tax_accn[1]] = set([tax_accn[0]])
else:
accns[tax_accn[1]].add(tax_accn[0])
fct +=1
if fct % 1000 ==0:
print ('file ct:\t%s' % fct)
print ('final step ....')
accnct = 0
for i in accns.keys():
outf.write('%s,%s,%s\n' % (i, len(foo), ','.join(list(accns[i]))))
accnct+=1
if accnct % 20000 == 0:
print ('accn ct:\t%s' % accnct)
outf.close()
if __name__=='__main__':
# get_all_files()
# print datetime.datetime.now()
# ncbi_lkp = get_full_cds_dict()
# print datetime.datetime.now()
#
# for line in sys.stdin:
# print line.strip()
# make_cog_insdn_map(line.strip(), insdn_lkp)
# make_cog_ncbi_map(line.strip(), ncbi_lkp)
# make_cog_silva_taxonid_map(line.strip())
# p = Pool(16)
# l = []
# for i in range(320):
# l.append('/projects/tallis/nute/data/ncbi-2017/partial_cog_uniq/all_cog_uniq_file_ids_' + str(i))
# p.map(get_all_files_in_list,l)
# make_file_seq_accn_taxid_lookup()
make_accn_taxid_lookup()