Skip to content

Commit 2ff831e

Browse files
DomDellaSeraDomDellaSera
DomDellaSera
authored and
DomDellaSera
committed
heavy refactoring; added cache and tests; added old bokeh stuff
1 parent e4315c1 commit 2ff831e

File tree

5 files changed

+117
-121
lines changed

5 files changed

+117
-121
lines changed

cache.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
2+
def getPandaRow():
    """Return a fixed one-row, six-column NumPy array used as cached test data.

    The single row is ``[1, 100.0, 'A8NJZ7', 1, 1, 122]``. ``dtype=object``
    keeps the mixed int/float/str values as plain Python objects instead of
    coercing them to a common string dtype.
    """
    # Imported locally, as in the original, so the module itself does not
    # require NumPy at import time.
    import numpy as np

    fixture_values = [1, 100.0, 'A8NJZ7', 1, 1, 122]
    return np.array([fixture_values], dtype=object)
6+
7+
def getHeaders():
8+
9+
headers = ['>sp|P90689|ACT_BRUMA Actin OS=Brugia malayi PE=1 SV=1', '>sp|A8Q3T2|ASNA_BRUMA ATPase ASNA1 homolog OS=Brugia malayi GN=Bm1_42140 PE=3 SV=1', '>sp|A8PWB6|BOP1_BRUMA Ribosome biogenesis protein BOP1 homolog OS=Brugia malayi GN=Bm1_36175 PE=3 SV=1', '>sp|P29030|CHIT_BRUMA Endochitinase OS=Brugia malayi PE=1 SV=1', '>sp|A8PB32|CLP1_BRUMA Protein CLP1 homolog OS=Brugia malayi GN=Bm1_20975 PE=3 SV=1', '>sp|A8PJX4|CLU_BRUMA Clustered mitochondria protein homolog OS=Brugia malayi GN=Bm1_28595 PE=3 SV=2', '>sp|A8QFY9|DDRGK_BRUMA DDRGK domain-containing protein 1 OS=Brugia malayi GN=Bm1_54325 PE=3 SV=1', '>sp|Q27450|CYP1_BRUMA Peptidyl-prolyl cis-trans isomerase 1 OS=Brugia malayi GN=CYP-1 PE=1 SV=1', '>sp|A8QBB1|DRE2_BRUMA Anamorsin homolog OS=Brugia malayi GN=Bm1_48140 PE=3 SV=1', '>sp|A8QDN3|EIF3K_BRUMA Eukaryotic translation initiation factor 3 subunit K OS=Brugia malayi GN=Bm1_52955 PE=3 SV=1', '>sp|A8PHP4|EIF3L_BRUMA Eukaryotic translation initiation factor 3 subunit L OS=Brugia malayi GN=Bm1_25770 PE=3 SV=1', '>sp|A8QE76|EFTS_BRUMA Elongation factor Ts, mitochondrial OS=Brugia malayi GN=Bm1_50845 PE=3 SV=1', '>sp|A8PKH2|EIF3A_BRUMA Eukaryotic translation initiation factor 3 subunit A OS=Brugia malayi GN=Bm1_29045 PE=3 SV=1', '>sp|A8NY27|EIF3E_BRUMA Eukaryotic translation initiation factor 3 subunit E OS=Brugia malayi GN=Bm1_11985 PE=3 SV=2', '>sp|A8NS61|EIF3G_BRUMA Eukaryotic translation initiation factor 3 subunit G OS=Brugia malayi GN=Bm1_08615 PE=3 SV=1', '>sp|A8QCY3|EIF3H_BRUMA Eukaryotic translation initiation factor 3 subunit H OS=Brugia malayi GN=Bm1_50170 PE=3 SV=1', '>sp|A8QBF3|EIF3I_BRUMA Eukaryotic translation initiation factor 3 subunit I OS=Brugia malayi GN=Bm1_48300 PE=3 SV=1', '>sp|Q4VWF8|GPMI_BRUMA 2,3-bisphosphoglycerate-independent phosphoglycerate mutase OS=Brugia malayi GN=ipgm-1 PE=1 SV=1', '>sp|P67877|GPXC_BRUMA Cuticular glutathione peroxidase OS=Brugia malayi PE=1 SV=1', '>sp|A8NS89|GOB1_BRUMA Trehalose-phosphatase OS=Brugia malayi 
GN=Bm1_08695 PE=1 SV=1', '>sp|A8QCH0|FEN1_BRUMA Flap endonuclease 1 OS=Brugia malayi GN=FEN1 PE=3 SV=1', '>sp|Q93142|FAR1_BRUMA Fatty-acid and retinol-binding protein 1 OS=Brugia malayi GN=far-1 PE=1 SV=1', '>sp|A8QGZ7|FBSP1_BRUMA F-box/SPRY domain-containing protein 1 OS=Brugia malayi GN=Bm1_56115 PE=3 SV=1', '>sp|P48812|G3P_BRUMA Glyceraldehyde-3-phosphate dehydrogenase OS=Brugia malayi GN=G3PD PE=1 SV=1', '>sp|A8PJJ2|GATC_BRUMA Glutamyl-tRNA(Gln) amidotransferase subunit C, mitochondrial OS=Brugia malayi GN=Bm1_27920 PE=3 SV=1', '>sp|A8QCE7|GUF1_BRUMA Translation factor GUF1 homolog, mitochondrial OS=Brugia malayi GN=Bm1_49530 PE=3 SV=1', '>sp|P27541|HSP70_BRUMA Heat shock 70 kDa protein OS=Brugia malayi GN=HSP70 PE=3 SV=1', '>sp|A0A0K0JFP3|HXK_BRUMA Hexokinase OS=Brugia malayi GN=Bm4678 PE=1 SV=1', '>sp|A8PGQ3|NARF_BRUMA Probable cytosolic Fe-S cluster assembly factor Bm1_25010 OS=Brugia malayi GN=Bm1_25010 PE=3 SV=1', '>sp|Q01202|MYSP_BRUMA Paramyosin OS=Brugia malayi PE=2 SV=2', '>sp|P48817|NDK_BRUMA Nucleoside diphosphate kinase OS=Brugia malayi GN=NDK PE=2 SV=1', '>sp|A8PV03|SLX1_BRUMA Structure-specific endonuclease subunit SLX1 homolog OS=Brugia malayi GN=Bm1_35165 PE=3 SV=1', '>sp|A8QFF6|SPAST_BRUMA Probable spastin homolog Bm1_53365 OS=Brugia malayi GN=Bm1_53365 PE=3 SV=1', '>sp|A8NJZ7|SCOC_BRUMA Short coiled-coil protein homolog OS=Brugia malayi GN=Bm1_04115 PE=3 SV=1', '>sp|P90703|RLA2_BRUMA 60S acidic ribosomal protein P2 OS=Brugia malayi GN=rpp-2 PE=3 SV=1', '>sp|P90707|RS23_BRUMA 40S ribosomal protein S23 OS=Brugia malayi GN=rps-23 PE=2 SV=1', '>sp|A8PJ38|RS3A_BRUMA 40S ribosomal protein S3a OS=Brugia malayi GN=Bm1_27225 PE=3 SV=1', '>sp|A8Q2H5|RSSA_BRUMA 40S ribosomal protein SA OS=Brugia malayi GN=Bm1_41245 PE=3 SV=2', '>sp|A8QC60|RTCB_BRUMA tRNA-splicing ligase RtcB homolog OS=Brugia malayi GN=Bm1_49220 PE=3 SV=1', '>sp|P90697|TCTP_BRUMA Translationally-controlled tumor protein homolog OS=Brugia malayi PE=2 SV=1', '>sp|P48822|TDX1_BRUMA 
Thioredoxin peroxidase 1 OS=Brugia malayi GN=TSA1 PE=2 SV=1', '>sp|Q17172|TDX2_BRUMA Thioredoxin peroxidase 2 OS=Brugia malayi GN=tsa-2 PE=2 SV=2', '>sp|P10723|SYNC_BRUMA Asparagine--tRNA ligase, cytoplasmic OS=Brugia malayi PE=1 SV=1', '>sp|Q9Y193|TIM13_BRUMA Mitochondrial import inner membrane translocase subunit Tim13 OS=Brugia malayi GN=TIM13 PE=3 SV=1', '>sp|A8NFF0|TRMB_BRUMA tRNA (guanine-N(7)-)-methyltransferase OS=Brugia malayi GN=Bm1_01445 PE=3 SV=1', '>sp|A8Q8J2|UFC1_BRUMA Ubiquitin-fold modifier-conjugating enzyme 1 OS=Brugia malayi GN=Bm1_46190 PE=3 SV=1', '>sp|A8Q8M5|UFM1_BRUMA Ubiquitin-fold modifier 1 OS=Brugia malayi GN=Bm1_46275 PE=3 SV=1', '>sp|A8Q2R5|WDR48_BRUMA WD repeat-containing protein 48 homolog OS=Brugia malayi GN=Bm1_41555 PE=3 SV=2', '>sp|Q17162|VINC_BRUMA Vinculin OS=Brugia malayi PE=2 SV=1', '>sp|A8QB65|WDR12_BRUMA Ribosome biogenesis protein WDR12 homolog OS=Brugia malayi GN=Bm1_47965 PE=3 SV=1', '>sp|O77049|JTB_BRUMA Protein JTB OS=Brugia malayi GN=JTB PE=2 SV=1', '>sp|A8PF69|LIAS_BRUMA Lipoyl synthase, mitochondrial OS=Brugia malayi GN=Bm1_23910 PE=3 SV=1', '>sp|A8QCE4|LST2_BRUMA Lateral signaling target protein 2 homolog OS=Brugia malayi GN=Bm1_49520 PE=3 SV=1', '>sp|P91850|MIFH_BRUMA Macrophage migration inhibitory factor homolog OS=Brugia malayi GN=Bm1_28435 PE=3 SV=4', '>sp|A8PW87|NUBP1_BRUMA Cytosolic Fe-S cluster assembly factor NUBP1 homolog OS=Brugia malayi GN=Bm1_36105 PE=3 SV=1', '>sp|A8QFQ3|NO66_BRUMA Bifunctional lysine-specific demethylase and histidyl-hydroxylase NO66 OS=Brugia malayi GN=Bm1_53875 PE=3 SV=1', '>sp|A8QHQ0|PESC_BRUMA Pescadillo homolog OS=Brugia malayi GN=Bm1_57380 PE=3 SV=2', '>sp|Q8IHI1|PSF2_BRUMA Probable DNA replication complex GINS protein PSF2 OS=Brugia malayi GN=BMBAC01P19.06 PE=3 SV=1', '>sp|A8QE42|PURA_BRUMA Adenylosuccinate synthetase OS=Brugia malayi GN=Bm1_50695 PE=3 SV=1', '>sp|P38542|RAN_BRUMA GTP-binding nuclear protein Ran OS=Brugia malayi GN=Bm1_44725 PE=2 SV=2', '>sp|Q93140|RL23_BRUMA 
60S ribosomal protein L23 OS=Brugia malayi GN=RPL23 PE=2 SV=1', '>sp|P90702|RL44_BRUMA 60S ribosomal protein L44 OS=Brugia malayi GN=rpl-44 PE=3 SV=3', '>sp|A8P7J8|U518_BRUMA UPF0518 protein Bm1_18400 OS=Brugia malayi GN=Bm1_18400 PE=3 SV=1', '>sp|A8NJ91|U729_BRUMA UPF0729 protein Bm1_03610 OS=Brugia malayi GN=Bm1_03610 PE=3 SV=1']
10+
return(headers)
11+
12+
13+
def subsetted_Headers():
14+
headers = [['P90689', 'Actin', 'NaGN'], ['A8Q3T2', 'ATPase ASNA1 homolog', 'Bm1_42140'], ['A8PWB6', 'Ribosome biogenesis protein BOP1 homolog', 'Bm1_36175'], ['P29030', 'Endochitinase', 'NaGN'], ['A8PB32', 'Protein CLP1 homolog', 'Bm1_20975'], ['A8PJX4', 'Clustered mitochondria protein homolog', 'Bm1_28595'], ['A8QFY9', 'DDRGK domain-containing protein 1', 'Bm1_54325'], ['Q27450', 'Peptidyl-prolyl cis-trans isomerase 1', 'CYP-1'], ['A8QBB1', 'Anamorsin homolog', 'Bm1_48140'], ['A8QDN3', 'Eukaryotic translation initiation factor 3 subunit K', 'Bm1_52955'], ['A8PHP4', 'Eukaryotic translation initiation factor 3 subunit L', 'Bm1_25770'], ['A8QE76', 'Elongation factor Ts, mitochondrial', 'Bm1_50845'], ['A8PKH2', 'Eukaryotic translation initiation factor 3 subunit A', 'Bm1_29045'], ['A8NY27', 'Eukaryotic translation initiation factor 3 subunit E', 'Bm1_11985'], ['A8NS61', 'Eukaryotic translation initiation factor 3 subunit G', 'Bm1_08615'], ['A8QCY3', 'Eukaryotic translation initiation factor 3 subunit H', 'Bm1_50170'], ['A8QBF3', 'Eukaryotic translation initiation factor 3 subunit I', 'Bm1_48300'], ['Q4VWF8', '2,3-bisphosphoglycerate-independent phosphoglycerate mutase', 'ipgm-1'], ['P67877', 'Cuticular glutathione peroxidase', 'NaGN'], ['A8NS89', 'Trehalose-phosphatase', 'Bm1_08695'], ['A8QCH0', 'Flap endonuclease 1', 'FEN1'], ['Q93142', 'Fatty-acid and retinol-binding protein 1', 'far-1'], ['A8QGZ7', 'F-box/SPRY domain-containing protein 1', 'Bm1_56115'], ['P48812', 'Glyceraldehyde-3-phosphate dehydrogenase', 'G3PD'], ['A8PJJ2', 'Glutamyl-tRNA(Gln) amidotransferase subunit C, mitochondrial', 'Bm1_27920'], ['A8QCE7', 'Translation factor GUF1 homolog, mitochondrial', 'Bm1_49530'], ['P27541', 'Heat shock 70 kDa protein', 'HSP70'], ['A0A0K0JFP3', 'Hexokinase', 'Bm4678'], ['A8PGQ3', 'Probable cytosolic Fe-S cluster assembly factor Bm1_25010', 'Bm1_25010'], ['Q01202', 'Paramyosin', 'NaGN'], ['P48817', 'Nucleoside diphosphate kinase', 'NDK'], ['A8PV03', 'Structure-specific 
endonuclease subunit SLX1 homolog', 'Bm1_35165'], ['A8QFF6', 'Probable spastin homolog Bm1_53365', 'Bm1_53365'], ['A8NJZ7', 'Short coiled-coil protein homolog', 'Bm1_04115'], ['P90703', '60S acidic ribosomal protein P2', 'rpp-2'], ['P90707', '40S ribosomal protein S23', 'rps-23'], ['A8PJ38', '40S ribosomal protein S3a', 'Bm1_27225'], ['A8Q2H5', '40S ribosomal protein SA', 'Bm1_41245'], ['A8QC60', 'tRNA-splicing ligase RtcB homolog', 'Bm1_49220'], ['P90697', 'Translationally-controlled tumor protein homolog', 'NaGN'], ['P48822', 'Thioredoxin peroxidase 1', 'TSA1'], ['Q17172', 'Thioredoxin peroxidase 2', 'tsa-2'], ['P10723', 'Asparagine--tRNA ligase, cytoplasmic', 'NaGN'], ['Q9Y193', 'Mitochondrial import inner membrane translocase subunit Tim13', 'TIM13'], ['A8NFF0', 'tRNA (guanine-N(7)-)-methyltransferase', 'Bm1_01445'], ['A8Q8J2', 'Ubiquitin-fold modifier-conjugating enzyme 1', 'Bm1_46190'], ['A8Q8M5', 'Ubiquitin-fold modifier 1', 'Bm1_46275'], ['A8Q2R5', 'WD repeat-containing protein 48 homolog', 'Bm1_41555'], ['Q17162', 'Vinculin', 'NaGN'], ['A8QB65', 'Ribosome biogenesis protein WDR12 homolog', 'Bm1_47965'], ['O77049', 'Protein JTB', 'JTB'], ['A8PF69', 'Lipoyl synthase, mitochondrial', 'Bm1_23910'], ['A8QCE4', 'Lateral signaling target protein 2 homolog', 'Bm1_49520'], ['P91850', 'Macrophage migration inhibitory factor homolog', 'Bm1_28435'], ['A8PW87', 'Cytosolic Fe-S cluster assembly factor NUBP1 homolog', 'Bm1_36105'], ['A8QFQ3', 'Bifunctional lysine-specific demethylase and histidyl-hydroxylase NO66', 'Bm1_53875'], ['A8QHQ0', 'Pescadillo homolog', 'Bm1_57380'], ['Q8IHI1', 'Probable DNA replication complex GINS protein PSF2', 'BMBAC01P19.06'], ['A8QE42', 'Adenylosuccinate synthetase', 'Bm1_50695'], ['P38542', 'GTP-binding nuclear protein Ran', 'Bm1_44725'], ['Q93140', '60S ribosomal protein L23', 'RPL23'], ['P90702', '60S ribosomal protein L44', 'rpl-44'], ['A8P7J8', 'UPF0518 protein Bm1_18400', 'Bm1_18400'], ['A8NJ91', 'UPF0729 protein Bm1_03610', 
'Bm1_03610']]
15+
return(headers)
16+
17+
18+
19+
20+
21+
22+
23+
24+
25+
26+
27+
28+
29+
30+
31+
32+
33+
34+

functional_tests.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from table_generator import extract_headers,extract_Header_Info
2+
3+
4+
5+
headers =extract_headers()
6+
#print(headers)
7+
print(extract_Header_Info(headers))
8+
print('done')

main.py

+31-104
Original file line numberDiff line numberDiff line change
@@ -7,110 +7,37 @@
77
from bokeh.plotting import figure
88
from bokeh.io import push_notebook,show,output_notebook
99
from ipywidgets import interact
10+
# MOST OF THIS FILE IS BEING PORTED TO getfilenames.py and table_generator.py
1011

12+
import pandas as pd
13+
import numpy as np
14+
from bokeh.plotting import figure
15+
from bokeh.io import push_notebook,show,output_notebook
16+
from ipywidgets import interact
17+
18+
19+
# Load the input tables from the current working directory.
protein_name_table = pd.read_csv("protein_names.csv")
disorder_stats = pd.read_csv("tmp.csv")

# Order proteins from most to least disordered.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported equivalent with the same arguments.
proteintable_bydisorder = disorder_stats.sort_values("pct_disord", ascending=False)

# Initial axes: total amino acids (x) against percent disorder (y).
y = proteintable_bydisorder.pct_disord
x = proteintable_bydisorder.tot_aa

bokeh_plot = figure(plot_width=1024, plot_height=576)
# Keep the renderer handle: updatey()/updatex() swap its data source columns.
r = bokeh_plot.scatter(x, y, color="#2222aa")
1126

12-
with open("c_elegans_manual_plus_iso/") as uniprot:
13-
uniprot =uniprot.read()
14-
seqs = re.(".*", uniprot[1:5000])
15-
print(seqs)
16-
#print(uniprot[1:1200])
17-
uni_subset =uniprot[1:10000]
18-
def uniprot_fasta_tableconv(fasta_file):
19-
fasta_file = uni_subset.split(">")
20-
for i in fasta_file:
21-
tmp=i.split("|")
22-
tmp = [tmp[0]+tmp[1], tmp[2]]
23-
tmp =
24-
#tmp = i.split("OS=")
25-
#tmp[1] = tmp[1].split("GN=")
26-
#tmp[]
27-
28-
29-
30-
31-
print(tmp)
27+
def _axis_column(label):
    """Return the ``proteintable_bydisorder`` column selected by *label*.

    Raises:
        ValueError: if *label* is not one of the known axis labels. The
            original if/elif chains left the result unbound in that case and
            failed with an UnboundLocalError.
    """
    # The keys are runtime strings matched against the widget selection, so
    # they are kept byte-for-byte (including the "Disorderd" spelling).
    column_by_label = {
        "Percent Disorder": "pct_disord",
        "Number of Disorderd Segments": "seg_disord",
        "30 Residues Disordered": "thirty_disord",
        "50 Residues Disordered": "fifty_disord",
        "Total Amino Acids": "tot_aa",
    }
    try:
        column = column_by_label[label]
    except KeyError:
        raise ValueError("unknown axis label: %r" % (label,)) from None
    return proteintable_bydisorder[column]


def updatey(f):
    """Point the scatter renderer's ``y`` data at the column named by label *f*."""
    r.data_source.data["y"] = _axis_column(f)


def updatex(f):
    """Point the scatter renderer's ``x`` data at the column named by label *f*."""
    r.data_source.data["x"] = _axis_column(f)
    # NOTE(review): indentation was lost in this view; push_notebook() is
    # assumed to belong to updatex (updatey has no such call) -- confirm.
    push_notebook()
3242

33-
def protein_stats(stats_file):
34-
with open(stats_file) as stats:
35-
stats = stats.read()
36-
stats = stats.rstrip()
37-
stats = stats.split("\n")
38-
39-
40-
stats = [x.split(":") for x in stats]
41-
count = -1
42-
43-
44-
45-
#For each row replace the value in the second item with a stripped version
46-
47-
#print(stats)
48-
error = None
49-
50-
for i in stats:
51-
count += 1
52-
value = stats[count][1]
53-
stats[count][1] = value.lstrip()
54-
if value == "":
55-
stats[count][1] = None
56-
continue
57-
try:
58-
float(i[1])
59-
except:
60-
61-
stats[count][1] = stats[count][1].split(" ")
62-
#if error != None
63-
# stats
64-
#[float(x[1]) for x in stats]
65-
#stats = [x[1].rstrip(" ") for x in stats]
66-
#stats = [x.rstrip(" ") for x in stats if " " in x]
67-
68-
pro_name = stats_file[:-12]
69-
70-
"""if stats[5][1]!= None:
71-
df_range = len(stats[5][1])
72-
else:
73-
df_range = 1"""
74-
stat_table = pd.DataFrame({"protein" : pd.Series(pro_name, index=list(range(1)), dtype=object),
75-
"tot_aa" : np.array(stats[0][1], dtype="int32"),
76-
"pct_disord" : np.array(stats[1][1], dtype="float32"),
77-
"thirty_disord" : np.array(stats[2][1], dtype="int32"),
78-
"fifty_disord" : np.array(stats[3][1], dtype="int32"),
79-
"seg_disord" : np.array(stats[4][1], dtype="int32")#,
80-
#"len_dist" : pd.Series(stats[5][1], dtype="float32")
81-
82-
83-
})
84-
#print(stats_file, "successfully put into table form")
85-
return(stat_table)
86-
datf=None
87-
#handle = open("c_elegans_manual_plus_iso/filenames.txt")
88-
#[print(x) for x in handle]
89-
#row_index = [x[:-13] for x in handle]
90-
#row_index = pd.Index(row_index)
91-
#print(row_index)
92-
handle = open("filenames.txt")
93-
counter = 0
94-
for i in handle:
95-
#print(i)
96-
97-
protein=i.rstrip()
98-
#protein += ".stats"
99-
#print(protein)
100-
#row_index = {x for x in protein}
101-
#print(row_index)
102-
try:
103-
single_row = protein_stats(protein)
104-
except:
105-
#print(protein, "failed")
106-
continue
107-
108-
if datf is None:
109-
datf = single_row
110-
else:
111-
datf = datf.append(single_row)
112-
if counter == 20000: break
113-
counter += 1
114-
115-
#print(datf)
116-
handle.close()
43+
show(bokeh_plot, notebook_handle=True)

table_generator.py

+30-15
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import re
66
import os
7+
#import pdb
78
logger = logging.getLogger()
89
logging.disable(logging.DEBUG)
910

@@ -38,6 +39,7 @@ def iterative_generation(dir = "tests/b_malayi_reviewed/"):
3839
#logging.debug(data_Table)
3940
return(data_Table)
4041

42+
"""
4143
def protein_Names_From_Fasta(uniprot_fasta = "fasta/brugia_malayi_reviewed.fasta"):
4244
# Purpose:
4345
# To scan the fasta header and extract the protein name, gene name , etc
@@ -73,31 +75,44 @@ def protein_Names_From_Fasta(uniprot_fasta = "fasta/brugia_malayi_reviewed.fasta
7375
except:
7476
gene_name=None #Not unifrom dataset
7577
gene_ontology.append([filename, protein_name, gene_name])
76-
with open('fasta_tmp.fasta', 'w') as tmp:
77-
tmp.write(uniprot)
78+
#with open('fasta_tmp.fasta', 'w') as tmp:
79+
# tmp.write(uniprot)
7880
7981
8082
return(gene_ontology)
83+
"""
8184

82-
def protein_Name_table_maker(uniprot_fasta = "fasta/brugia_malayi_reviewed.fasta"):
83-
85+
def extract_headers(uniprot_fasta = "fasta/brugia_malayi_reviewed.fasta"):
    """Return every FASTA header line of *uniprot_fasta*, in file order.

    Args:
        uniprot_fasta: path of the FASTA file to scan.

    Returns:
        A list of header strings, each starting with ``>`` and without the
        trailing newline. An empty list if the file contains no headers.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Stream the file line by line instead of reading it whole, and accept
    # only lines that *start* with '>' so a stray '>' inside another line is
    # never mistaken for a header.
    with open(uniprot_fasta) as fasta_file:
        return [line.rstrip("\n") for line in fasta_file if line.startswith(">")]
89+
#pdb.set_trace()
8490
# Purpose:
8591
#This takes our original fasta from uniprot and tries to return a table with the filename, protein name, and gene name
8692
# THIS NEEDS HEAVY DEBUGGING -- I'M NOT SURE YET WHAT IT DOES EXACTLY IN RELATION TO THE PRIOR FUNCTION
93+
def extract_Header_Info(fasta_headers):
    """Parse Swiss-Prot FASTA headers into ``[accession, protein, gene]`` rows.

    Each header is expected to look like
    ``>sp|ACCESSION|ENTRY_NAME Protein name OS=Organism [GN=Gene] PE=n SV=n``.

    Args:
        fasta_headers: iterable of header strings, e.g. the list returned by
            ``extract_headers``.

    Returns:
        A list with one ``[accession, protein_name, gene_name]`` list per
        header, in input order. ``gene_name`` is the placeholder ``"NaGN"``
        when the header carries no ``GN=... PE=`` field.

    Raises:
        IndexError: if a header lacks the ``>sp|ACCESSION|`` prefix or the
            `` OS=`` field. The exception type matches what the original
            ``re.findall(...)[0]`` lookups raised.
    """
    # Compiled once, outside the loop. Raw strings avoid invalid-escape
    # warnings; [^|]+ stops at the first '|' so a pipe later in the
    # description cannot leak into the accession.
    accession_pattern = re.compile(r">sp\|([^|]+)\|")
    gene_pattern = re.compile(r"\sGN=(.*?)\sPE=")

    extracted_Info = []
    for header in fasta_headers:
        accession_match = accession_pattern.search(header)
        if accession_match is None:
            raise IndexError("no '>sp|ACCESSION|' prefix in FASTA header: %r" % (header,))

        # Everything after the first space is the free-text description; the
        # protein name is the part before the first " OS=" field.
        description = header.partition(" ")[2]
        protein_name, organism_marker, _ = description.partition(" OS=")
        if not organism_marker:
            raise IndexError("no ' OS=' field in FASTA header: %r" % (header,))

        # The gene name is optional in this dataset; use the same "NaGN"
        # placeholder as before instead of swallowing every exception.
        gene_match = gene_pattern.search(header)
        gene_name = gene_match.group(1) if gene_match else "NaGN"

        extracted_Info.append([accession_match.group(1), protein_name, gene_name])
    return extracted_Info
87110

88-
filename =re.findall(">sp\|(.*)\|", uniprot_fasta)[0]
89-
print(filename)
90-
#exit()
91-
protein_name =re.findall(">sp.*L (.*) OS", uniprot_fasta)[0]
92-
try:
93-
gene_name= re.findall(">sp.*GN=(.*) PE", uniprot_fasta)[0]
94-
except:
95-
96-
gene_name=""
111+
def gene_Name_Format_Correction(list_Header):
97112
gene_name_length=len(gene_name)
98113
if gene_name in protein_name and gene_name_length!=0:
99-
100-
# Many of these fasta files have the gen name inside of them so here I'm searching for a gene name in the
114+
115+
# Many of these fasta files have the gen name inside of them so here I'm searching for a gene name in the
101116
# protein name and extracting it if it exists in two places
102117
subtractor =len(gene_name)
103118
length =len(protein_name)

0 commit comments

Comments
 (0)