-
Notifications
You must be signed in to change notification settings - Fork 0
/
otherutils
180 lines (151 loc) · 6.81 KB
/
otherutils
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""methods from the otherutils module in OrthoEvol"""
class BlastUtils(object):
    """Helpers that build per-taxonomy seqid (GI) lists for standalone BLAST."""

    def __init__(self):
        """
        Various utilities to help with blast specific functionality.
        """
        pass

    def seqid_list_config(self, seqid_list_path, taxonomy_ids, research_path=None, config=False):
        """Create a seqid list based on the refseq_rna database for each taxonomy id.

        It will also convert the gi list into a binary file which is more
        efficient to use with NCBI's Standalone Blast tools.

        :param seqid_list_path: Directory in which to build the lists; only
            used when ``config`` is false.
        :param taxonomy_ids: Iterable of taxonomy ids to build lists for.
        :param research_path: Project root; required when ``config`` is true.
            The lists are then built in ``<research_path>/raw_data`` and the
            ids are saved to ``<research_path>/index/taxids.csv``.
        :param config: Select the project-layout mode described above.
        :raises TypeError: If ``config`` is true and ``research_path`` is None.
        """
        warn("NCBI has deprecated using GI numbers.", OrthoEvolDeprecationWarning)
        if config:
            # Fail with a readable message instead of an opaque
            # "unsupported operand type(s) for /" error.
            if research_path is None:
                raise TypeError("research_path is required when config=True.")
            # Directory and file handling
            research_path = Path(research_path)
            taxid_file = research_path / 'index' / 'taxids.csv'
            pd.Series(taxonomy_ids).to_csv(str(taxid_file), index=False)
            # TODO Rework this
            seqid_list_path = research_path / 'raw_data'
        self.create_seqid_lists(seqid_list_path=seqid_list_path, taxonomy_ids=taxonomy_ids)

    def create_seqid_lists(self, seqid_list_path, taxonomy_ids):
        """Use the blastdbcmd tool to generate seqid lists.

        It then uses the blastdb_aliastool to turn the list into a binary file.
        One list is generated per taxonomy id, using a pool of 3 threads.

        :param seqid_list_path: Directory to switch into before generating the
            files. If it does not exist the current working directory is kept.
        :param taxonomy_ids: Iterable of taxonomy ids.
        """
        warn("NCBI has deprecated using GI numbers.", OrthoEvolDeprecationWarning)
        target_dir = str(seqid_list_path)
        if os.path.exists(target_dir):
            # NOTE: this changes the working directory of the whole process;
            # the worker threads below write their files relative to it.
            os.chdir(target_dir)
        # Use the blastdbcmd tool to generate gi lists
        # based on Organisms/Taxonomy id's.
        # TODO Create blastdbcmd commandline tools
        started = time.time()
        with ThreadPool(3) as gilist_pool:
            gilist_pool.map(self._taxid2seqidlist, taxonomy_ids)
        minutes = (time.time() - started) / 60
        seqidlist_log.info("Took %s minutes to create gi binary files." % minutes)

    def _taxid2seqidlist(self, taxonomy_id):
        """Use a taxonomy id in order to get the list of GI numbers.

        Writes ``<taxid>gi.txt`` in the current working directory, converts it
        to the binary file ``<taxid>gi`` and removes the text file. Nothing is
        done when the binary file is already present.

        :param taxonomy_id: Taxonomy id (converted with ``str``).
        :raises NotImplementedError: On any platform other than Linux.
        """
        warn("NCBI has deprecated using GI numbers.", OrthoEvolDeprecationWarning)
        tid = str(taxonomy_id)
        binary = tid + 'gi'
        if binary in os.listdir():
            seqidlist_log.info('%s already exists' % str(binary))
            return
        if platform.system() != 'Linux':
            raise NotImplementedError(platform.system() + ' is not supported')

        text_file = tid + "gi.txt"
        # TODO Convert to subprocess
        # TODO Test this on Linux
        # NOTE(review): `runcmd` is not defined in this part of the file;
        # presumably it is imported at module level - confirm.
        # %g is the GI and %T the taxonomy id, so awk keeps the GI of every
        # record whose taxid column equals `tid`. (The accession specifier %a
        # can never equal a taxonomy id, which left the list empty.)
        runcmd(
            "blastdbcmd -db refseq_rna -entry all -outfmt '%g %T' | awk ' { if ($2 == " + tid + ") { print $1 } } ' > " + text_file)
        seqidlist_log.info(text_file + " has been created.")
        # Convert the .txt file to a binary file using the blastdb_aliastool
        runcmd("blastdb_aliastool -gi_file_in " + text_file + " -gi_file_out " + binary)
        seqidlist_log.info(tid + "gi binary file has been created.")
        # Remove the intermediate text file
        os.remove(text_file)
        seqidlist_log.info(text_file + " file has been deleted.")
class OtherUtils(object):
    """General-purpose helpers: shell commands, list/CSV handling, file access."""

    def __init__(self):
        """
        Various utilities to help with non-specific functionality.
        """
        pass

    def runcmd(self, command_string):
        """Run a command string through the shell.

        :param command_string: The full command line to execute.
        :return: The decoded stdout when the command exits with status 0,
            otherwise the decoded stderr of the failed command.
        :rtype: str
        """
        try:
            # Shell MUST be True. The command is passed as a single string,
            # which is the form subprocess recommends together with shell=True.
            cmd_status = run(command_string, stdout=PIPE, stderr=PIPE, shell=True, check=True)
        except CalledProcessError as err:
            # check=True turns every non-zero exit status into this exception.
            return err.stderr.decode('utf-8')
        # Reaching this point means the exit status was 0.
        return cmd_status.stdout.decode('utf-8')

    def splitlist(self, listname, basefilename, n):
        """
        Split a long list into chunks and save chunks as a text file.

        Chunk number ``i`` is written to ``<basefilename>_list_<i>.txt`` with
        one item per line and no header.

        :param listname: The list to split.
        :param basefilename: Prefix (may include a directory) of the files.
        :param n: Maximum number of items per chunk; must be at least 1.
        :return: The file names without the ``.txt`` extension, in order.
        :rtype: list
        :raises ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer.")
        # Split the list into chunks
        chunks = [listname[start:start + n] for start in range(0, len(listname), n)]
        list_group = []
        # Name and save the lists
        for num, chunk in enumerate(chunks):
            chunk_name = basefilename + '_list_' + str(num)
            pd.DataFrame(chunk).to_csv(chunk_name + ".txt", index=False, header=False)
            list_group.append(chunk_name)
        return list_group

    def formatlist(self, input_list):
        """Remove spaces from list items and turn those spaces into underscores.

        :param input_list: Iterable of items; each one is converted with ``str``.
        :return: A new list of strings.
        :rtype: list
        """
        return [str(item).replace(" ", "_") for item in input_list]

    def makedirectory(self, path):
        """Creates path/parents and is compatible for python 3.4 and upwards.

        An already existing directory is left untouched.

        :param path: Directory to create (string or path object).
        :raises OSError: If the directory cannot be created, for example when
            ``path`` exists but is not a directory.
        """
        # The previous guard (`not exist_ok and ...` with exist_ok = True) was
        # always false, so no directory was ever created.
        os.makedirs(str(path), exist_ok=True)

    # def set_paths(self, parent, **children):
    # raise NotImplementedError("This function is being developed.")

    def csvtolist(self, csvfile, column_header='Organism'):
        """Turn column from csv file into a list.

        :param csvfile: Path or buffer passed to ``pandas.read_csv``.
        :param column_header: Name of the column to extract.
        :return: The values of that column.
        :rtype: list
        """
        table = pd.read_csv(csvfile)
        return list(table[column_header])

    def has_handle(self, fpath):
        """Determine if there are processes with this file opened (Linux).

        :param fpath: Path compared for equality with the paths reported by
            psutil. NOTE(review): psutil presumably reports absolute paths, so
            a relative ``fpath`` would never match - confirm.
        :return: True if a process reports ``fpath`` as open, else False.
        :rtype: bool
        """
        for proc in psutil.process_iter():
            try:
                for item in proc.open_files():
                    if fpath == item.path:
                        return True
            except Exception:
                # Best effort: a process whose open files cannot be listed is
                # skipped rather than aborting the scan.
                continue
        return False

    def safe_open(self, fpath, mode, iterations, cnt=0):
        """Safely open a file thats being accessed by multiple processes.

        The file is opened as soon as ``has_handle`` reports that no process
        holds it. There is no pause between checks.

        :param fpath: Path of the file to open.
        :param mode: Mode passed to ``open``.
        :param iterations: Attempt count at which to give up.
        :param cnt: Number of attempts already used.
        :return: The open file object.
        :raises IOError: If the file is still in use after the allowed attempts
            (or if ``iterations`` is not larger than ``cnt``).
        """
        # Iterative retry: the recursive version dropped the return value of
        # the nested call, so any retry returned None and leaked the handle.
        for _ in range(cnt, iterations):
            if not self.has_handle(fpath):
                return open(file=fpath, mode=mode)
        raise IOError("%s is still in use after %s attempts." % (fpath, iterations))