-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransformProtein.py
97 lines (78 loc) · 3.23 KB
/
transformProtein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt
import time
import random
from tokenizer import Tokenizer
from model_manager import VocabularyManager
class transformProtein:
    """Augment and encode a protein record into a fixed-length index array.

    A record is a dict with (at least) the keys ``'kw'`` (list of keyword
    token indices), ``'ex'`` (existence level) and ``'seq'`` (amino-acid
    sequence). ``transformSample`` applies keyword dropout and optional
    sequence reversal, maps residues to token indices through
    ``Tokenizer.aa_to_ctrl_idx``, appends the stop token and pads the result
    to ``maxSampleLength``.
    """

    def __init__(self, stop_token = 4, mapfold = 'mapping_files/', maxSampleLength = 512,
                 verbose = False, dropRate = 0.2, seqonly = False, noflipseq = False):
        """
        Store the augmentation settings and build the tokenizer.

        stop_token      -- index appended after the encoded sequence
        mapfold         -- accepted for backward compatibility; not used here
        maxSampleLength -- length of the padded array returned by transformSample
        verbose         -- if True, transformSample prints the raw and encoded sample
        dropRate        -- per-keyword drop probability used by transformSample
        seqonly         -- stored on the instance; not read by any method in this class
        noflipseq       -- if True, transformSeq never reverses the sequence
        """
        self.stop_token = stop_token
        self.maxSampleLength = maxSampleLength
        self.verbose = verbose
        self.dropRate = dropRate
        self.seqonly = seqonly
        self.noflipseq = noflipseq
        self.tokenizer = Tokenizer()
        vocab_manager = VocabularyManager()
        # The last vocabulary index doubles as the padding value in transformSample.
        self.oneEncoderLength = vocab_manager.vocab_size - 1

    def transformSeq(self, seq, prob = 0.2):
        """
        Randomly reverse the amino-acid sequence.

        With probability ``prob`` the reversed sequence is returned; otherwise
        (or whenever ``self.noflipseq`` is set) ``seq`` is returned unchanged.
        Substitutions/dropout are not implemented.
        """
        if self.noflipseq:
            return seq
        if np.random.random() > (1 - prob):
            return seq[::-1]
        return seq

    def transformKwSet(self, kws, drop = 0.2):
        """
        Apply dropout to the keyword list.

        Each keyword is dropped independently with probability ``drop``.
        Returns a new list; ``kws`` itself is left untouched so that the
        caller's record is not eroded by repeated augmentation.
        """
        # Build a fresh list instead of calling kws.remove() inside the loop:
        # removing while iterating skips the element after every removal and
        # mutates the caller's data.
        return [kw for kw in kws if not np.random.random() < drop]

    def transformSample(self, proteinDict):
        """
        Transform/augment one sample and encode it as a fixed-length array.

        The encoded layout is ``kws + sequence tokens + [stop_token]``, padded
        on the right with ``self.oneEncoderLength`` (not zeros) up to
        ``self.maxSampleLength``.

        Returns a tuple ``(encodedSample, existence, thePadIndex)``:
        encodedSample -- int array of length ``maxSampleLength``
        existence     -- 2 if ``proteinDict['ex']`` is 4 or 5, else 1
                         (used by callers as a weight multiplier)
        thePadIndex   -- index of the first padding position, i.e. the
                         unpadded length including the stop token

        Raises ValueError if the unpadded sample is longer than
        ``maxSampleLength`` and KeyError if a residue is missing from the
        tokenizer mapping.
        """
        existence = 1
        # Keyword dropout is drawn before the sequence flip, keeping the
        # order of random draws stable.
        kws = self.transformKwSet(proteinDict['kw'], drop = self.dropRate)
        if proteinDict['ex'] in (4, 5):
            existence += 1

        seq = self.transformSeq(proteinDict['seq'])
        aa_to_idx = self.tokenizer.aa_to_ctrl_idx
        seq = [aa_to_idx[aa] for aa in seq]
        seq = np.array(kws + seq + [self.stop_token]).astype(int)

        thePadIndex = len(seq)
        if thePadIndex > self.maxSampleLength:
            # Fail with a readable message rather than NumPy's broadcast error.
            raise ValueError(
                'encoded sample length %d exceeds maxSampleLength %d'
                % (thePadIndex, self.maxSampleLength))

        encodedSample = np.full(self.maxSampleLength, self.oneEncoderLength, dtype = int)
        encodedSample[:thePadIndex] = seq

        if self.verbose:
            print('Raw Data')
            for k in proteinDict:
                print('--------',k,'--------')
                print(proteinDict[k])
            print('Transformed Sample -------')
            print('Seq',seq)
            print('Existence', existence)
            print('KWs',kws)
            print('encodedSample',encodedSample)
            print('thePadIndex', thePadIndex)
        return encodedSample, existence, thePadIndex
if __name__ == "__main__":
    # Smoke test: load one pickled chunk of records, encode the first record
    # with keyword dropout disabled, and print the pieces of the result.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # run this against trusted data.
    with open('data_enzymes_classes/all_families_data/test_ec_1.p','rb') as handle:
        test_chunk = pickle.load(handle)
    obj = transformProtein(verbose=False, dropRate = 0.0, seqonly = False)
    for key in test_chunk:
        encodedSample, existence, thePadIndex = obj.transformSample(test_chunk[key])
        report = (
            ('max sample len', obj.maxSampleLength),
            ('encodedSample: ', encodedSample),
            ('existence: ', existence),
            ('thePadIndex', thePadIndex),
            # Everything before the pad index is real data ...
            ('encodedSample + pad index: ', encodedSample[:thePadIndex]),
            # ... and everything from it onward is padding.
            ('encodedSample - pad index: ', encodedSample[thePadIndex:]),
        )
        for label, value in report:
            print(label, value)
        # Only the first record is needed for the demonstration.
        break