-
Notifications
You must be signed in to change notification settings - Fork 0
/
mlk_tfbs_jaspar.py
121 lines (94 loc) · 5.07 KB
/
mlk_tfbs_jaspar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# TFBS Annotation
# Uses AnimalTFDB3.0 Binding Site predictions or JASPAR 2018 output to generate genbank annotations
# http://bioinfo.life.hust.edu.cn/AnimalTFDB/#!/tfbs_predict
import os
import argparse
import pandas as pd
import numpy
import datetime
import statistics
from collections import defaultdict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import IUPAC
parser = argparse.ArgumentParser()
parser.add_argument("tfbs_file", help="input file of the transcription factor binding site")
parser.add_argument("-s", "--score", help="score cutoff; defualt = 0", default = 0, type = float)
parser.add_argument("-r", "--relativescore", help="relative sscore cutoff; default = .8", default = .8, type = float)
parser.add_argument("-x", "--sequence", help="raw sequence to process")
parser.add_argument("-v", "--verbose", action='store_true', help="print detailed stats")
arguments = parser.parse_args()
# Read in TFBS file
tfbs_db = pd.read_csv(arguments.tfbs_file, sep=',', header=0)
#build genbank file entry
if arguments.sequence is not None:
sequence_string = arguments.sequence
sequence_object = Seq(sequence_string, IUPAC.unambiguous_dna)
#create a record
genbank_record = SeqRecord(sequence_object,
id='123456789', # random accession number
name=arguments.tfbs_file.rsplit('.')[0] + '-tfbs',
description='Autogenerated Genbank file annotated with TFBS data')#,
#date=datetime.datetime.now())
# data structure to keep track of previously added tfbs
#todo
tfbs_dict_previous_added = defaultdict(list)
all_scores = []
all_relativescores = []
passed_scores = []
passed_relativescores = []
print('')
print('Thresholding cutoffs at: ')
print('score: ', arguments.score)
print('relativescore: ', arguments.relativescore)
for entry in tfbs_db.iterrows():
all_scores.append(entry[1]['Score'])
all_relativescores.append(entry[1]['Relative score'])
#cuttoff by score
if float(entry[1]['Score']) > arguments.score and float(entry[1]['Relative score']) > float(arguments.relativescore):
#check for redundancy
if entry[1]['Name'] not in tfbs_dict_previous_added.keys() or (entry[1]['Start'], entry[1]['End']) not in tfbs_dict_previous_added[entry[1]['Name']]:
#append
tfbs_dict_previous_added[entry[1]['Name']].append((entry[1]['Start'], entry[1]['End']))
feature = SeqFeature(FeatureLocation(start=entry[1]['Start'], end=entry[1]['End']), type='TFBS', strand=0)
feature.qualifiers['note'] = str(entry[1]['Score']) + ' [' + str(entry[1]['Relative score']) + ']' + ' ' + entry[1]['Strand']+ entry[1]['Predicted sequence']
feature.qualifiers['label'] = str(entry[1]['Name'])
#color picker
if entry[1]['Relative score'] < .8 :
feature.qualifiers['ApEinfo_revcolor'] = '#FFFFFF' #white
feature.qualifiers['ApEinfo_fwdcolor'] = '#FFFFFF' #white
if entry[1]['Relative score'] >= .8 and entry[1]['Relative score'] <.85:
feature.qualifiers['ApEinfo_revcolor'] = '#D3D3D3' #light gray
feature.qualifiers['ApEinfo_fwdcolor'] = '#D3D3D3' #light gray
if entry[1]['Relative score'] >= .85 and entry[1]['Relative score'] <.9:
feature.qualifiers['ApEinfo_revcolor'] = '#808080' #gray
feature.qualifiers['ApEinfo_fwdcolor'] = '#808080' #gray
if entry[1]['Relative score'] >= .9 and entry[1]['Relative score'] <.95:
feature.qualifiers['ApEinfo_revcolor'] = '#505050' #dark gray
feature.qualifiers['ApEinfo_fwdcolor'] = '#505050' #dark gray
if entry[1]['Relative score'] >= .95 and entry[1]['Relative score'] <1:
feature.qualifiers['ApEinfo_revcolor'] = '#000000' #black
feature.qualifiers['ApEinfo_fwdcolor'] = '#000000' #black
genbank_record.features.append(feature)
passed_scores.append(entry[1]['Score'])
passed_scores.append(entry[1]['Relative score'])
else:
pass
print('duplicate entry')
print('')
print('added ', len(tfbs_dict_previous_added.keys()), ' features to ', arguments.tfbs_file.rsplit('.')[0])
if arguments.verbose is True:
# print run metrics
print('')
print('Stats of Entire Set: (score, relative score)')
print('Ranges:', ((min(all_scores),max(all_scores)), (min(all_relativescores), (max(all_relativescores)))))
print('Means:', statistics.mean(all_scores), statistics.mean(all_relativescores))
print('')
print('Stats of Passed Set: (score, relative score)')
print('Ranges:', (min(passed_scores),max(passed_scores)), (min(passed_relativescores),max(passed_relativescores)))
print('Means:', statistics.mean(passed_scores), statistics.mean(passed_relativescores))
# Save as GenBank file
output_file = open(arguments.tfbs_file.rsplit('.')[0] + '-tfbs_annotated.gb', 'w')
SeqIO.write(genbank_record, output_file, 'genbank')