sequenceML.py
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
def sequenceML(fasta_file, expression_file, test_size, random_state, filename, tokenizer):
"""summary_line
a automated framework for the sequence to expression
machine learning. it takes the fasta sequences, expression
file from the expression analysis and then writes the pickle
file for the automated machine learning. It goes through all
the classifier and gives you all the classifier and you can select
the one based on the precision and accuracy call. It writes pickle
file with the highest protocol
Keyword arguments:
argument -- description
fasta_file_: coming from the transcriptome assembly
test_size_ : define the test size for the machine split learning
random_state_ : define the random state for the machine learning
filename_ : for writing the machine learning classifier and the pickle files
tokenizer_ size of the token, while optimizing this i found the token size of
fasta sequences should not be more than 4-6 tokens if the length of the sequences
is not long.
Return: return_description
"""
testsize = test_size
random = random_state
token = int(tokenizer)
    # parse the FASTA file into {header: sequence}, joining wrapped sequence lines
    sequence_file_train_read = list(filter(None, [x.strip() for x in open(fasta_file).readlines()]))
    sequence_train_dict = {}
    for i in sequence_file_train_read:
        if i.startswith(">"):
            fasta_header = i
            if fasta_header not in sequence_train_dict:
                sequence_train_dict[fasta_header] = ""
            continue
        sequence_train_dict[fasta_header] += i
    ids = [header.replace(">", "") for header in sequence_train_dict]
    sequences = list(sequence_train_dict.values())
    sequence_dataframe = pd.DataFrame({"ids": ids, "sequence": sequences})
sequence_dataframe["expression"] = pd.read_csv(expression_file)
sequence_dataframe["class"] = sequence_dataframe["expression"].apply(lambda n: "1" if n == 0.1 else "2" \
if n == 0.2 else "3" if n == 0.4 else "4" if n == 0.6 \
else "5" if n == 0.8 else "6" if n == 0.3 \
else "7" if n == 0.5 else n)
    def segment(x):
        # sliding-window k-mers, e.g. with token = 3, "ATGCA" -> ["ATG", "TGC", "GCA"]
        return [x[i:i + token] for i in range(len(x) - token + 1)]
    sequence_dataframe["segmentation"] = sequence_dataframe["sequence"].apply(segment)
    # join each k-mer list into a space-separated string for CountVectorizer
    store_segmentation = [' '.join(kmers) for kmers in sequence_dataframe["segmentation"]]
    segmentation_class = sequence_dataframe["class"].values
    # build one CountVectorizer per valid n-gram range between 3 and 6
    # (start must not exceed stop, so pairs such as (5, 4) are skipped)
    vectorise_start = [3, 4, 5, 6]
    vectorise_stop = [3, 4, 5, 6]
    ngram_ranges = [(i, j) for i in vectorise_start for j in vectorise_stop if i <= j]
    storing_count_vectorise = [CountVectorizer(ngram_range=pair) for pair in ngram_ranges]
    # fit one Multinomial Naive Bayes classifier per n-gram range, sweeping
    # the smoothing parameter alpha across the runs
    alpha_predictions = list(np.linspace(0.1, 1, num=len(storing_count_vectorise)))
    fitted_classifiers = []
    for vectoriser, alpha in zip(storing_count_vectorise, alpha_predictions):
        X = vectoriser.fit_transform(store_segmentation)
        X_train, X_test, y_train, y_test = train_test_split(
            X, segmentation_class, test_size=testsize, random_state=random)
        sequential_classifier = MultinomialNB(alpha=alpha).fit(X_train, y_train)
        predictions = sequential_classifier.predict(X_test)
        fitted_classifiers.append((vectoriser, sequential_classifier, predictions, y_test))
    # write everything with the highest pickle protocol
    with open(filename, "wb") as f:
        pickle.dump(fitted_classifiers, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(alpha_predictions, f, pickle.HIGHEST_PROTOCOL)
    print("pickle file for the machine learning has been written")