Skip to content

Commit bc721a1

Browse files
committed
Supported --user_alignment #28
1 parent d8cb8e4 commit bc721a1

File tree

5 files changed

+61
-14
lines changed

5 files changed

+61
-14
lines changed

csubst/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '1.2.4'
1+
__version__ = '1.3.0'

csubst/csubst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ if __name__ == "__main__":
5757
# shared: common
5858
psr_co = argparse.ArgumentParser(add_help=False)
5959
psr_co.add_argument('--alignment_file', metavar='PATH', default='', type=str,
60-
help='default=%(default)s: PATH to input in-frame codon alignment.')
60+
help='default=%(default)s: PATH to in-frame codon alignment (FASTA format).')
6161
psr_co.add_argument('--rooted_tree_file', metavar='PATH', default='', type=str,
62-
help='default=%(default)s: PATH to input rooted tree. Tip labels should be consistent with --alignment_file.')
62+
help='default=%(default)s: PATH to input rooted tree (Newick format). Tip labels should be consistent with --alignment_file.')
6363
psr_co.add_argument('--genetic_code', metavar='INTEGER', type=int, required=False, default=1,
6464
help='default=%(default)s: NCBI codon table ID. 1 = "Standard". See here: '
6565
'https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi')
@@ -208,6 +208,8 @@ if __name__ == "__main__":
208208
help='default=%(default)s: E-value cutoff in the database searches. Applied to MMseqs2 and QBLAST.')
209209
site.add_argument('--database_minimum_identity', metavar='FLOAT', default=0.5, required=False, type=float,
210210
help='default=%(default)s: The minimum sequence identity for the database searches. Applied to MMseqs2. See https://search.rcsb.org/index.html#search-api')
211+
site.add_argument('--user_alignment', metavar='PATH', default=None, required=False, type=str,
212+
help='default=%(default)s: The user-provided alignment FASTA for the substitution mapping to protein structures.')
211213
site.add_argument('--remove_solvent', metavar='yes|no', default='yes', type=strtobool,
212214
help='default=%(default)s: Whether to remove solvent and small non-specific ligands. '
213215
'Used only with --pdb.')

csubst/main_site.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -681,9 +681,13 @@ def main_site(g):
681681
id_base = re.sub('.cif$', '', id_base)
682682
g['pdb_outfile_base'] = os.path.join(g['site_outdir'], 'csubst_site.' + id_base)
683683
parser_pymol.initialize_pymol(g=g)
684-
g['mafft_add_fasta'] = g['pdb_outfile_base']+'.fa'
685-
parser_pymol.write_mafft_map(g=g)
686-
df = parser_pymol.add_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map')
684+
if g['user_alignment'] is not None:
685+
g['mafft_add_fasta'] = g['user_alignment']
686+
df = parser_pymol.add_coordinate_from_user_alignment(df=df, user_alignment=g['mafft_add_fasta'])
687+
else:
688+
g['mafft_add_fasta'] = g['pdb_outfile_base']+'.fa'
689+
parser_pymol.write_mafft_alignment(g=g)
690+
df = parser_pymol.add_coordinate_from_mafft_map(df=df, mafft_map_file='tmp.csubst.pdb_seq.fa.map')
687691
df = parser_pymol.add_pdb_residue_numbering(df=df)
688692
g['session_file_path'] = g['pdb_outfile_base']+'.pymol.pse'
689693
parser_pymol.write_pymol_session(df=df, g=g)

csubst/parser_pymol.py

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import pandas
33
import pymol
44

5+
from Bio import SeqIO
6+
57
from io import StringIO
68
import copy
79
import os
@@ -26,7 +28,7 @@ def initialize_pymol(g):
2628
print('Loading PDB file: {}'.format(g['pdb']), flush=True)
2729
pymol.cmd.load(g['pdb'])
2830

29-
def write_mafft_map(g):
31+
def write_mafft_alignment(g):
3032
tmp_pdb_fasta = 'tmp.csubst.pdb_seq.fa'
3133
mafft_map_file = tmp_pdb_fasta+'.map'
3234
if os.path.exists(mafft_map_file):
@@ -45,16 +47,19 @@ def write_mafft_map(g):
4547
out_mafft = subprocess.run(cmd_mafft, stdout=subprocess.PIPE)
4648
with open(g['mafft_add_fasta'], 'w') as f:
4749
f.write(out_mafft.stdout.decode('utf8'))
50+
print('')
4851
for i in range(10):
4952
if os.path.exists(mafft_map_file):
50-
print('mafft map file was generated.', flush=True)
53+
print('MAFFT alignment file was generated: {}'.format(g['mafft_add_fasta']), flush=True)
5154
break
5255
else:
53-
print('mafft map file not detected. Waiting {:} sec'.format(i+1), flush=True)
56+
print('MAFFT alignment file not detected. Waiting {:} sec'.format(i+1), flush=True)
5457
time.sleep(1)
55-
txt = 'CSUBST does not exclude poorly aligned regions. ' \
56-
'Please carefully check {} before biological interpretation of substitution events.'
57-
print(txt.format(g['mafft_add_fasta']), flush=True)
58+
print('CSUBST does not exclude poorly aligned regions.', flush=True)
59+
print('Please carefully check the MAFFT alignment file before biological interpretation of substitution events.', flush=True)
60+
print('If manual adjustment is necessary, please correct the amino acid positions of database-derived sequences and use the updated MAFFT alignment file as input with --user_alignment.', flush=True)
61+
print('The CSUBST input sequences (i.e., sequences in the file specified by --alignment_file) should not be modified at this stage.', flush=True)
62+
print('', flush=True)
5863
if os.path.getsize(g['mafft_add_fasta'])==0:
5964
sys.stderr.write('File size of {} is 0. A wrong ID might be specified in --pdb.\n'.format(g['mafft_add_fasta']))
6065
sys.stderr.write('Exiting.\n')
@@ -91,9 +96,12 @@ def add_pdb_residue_numbering(df):
9196
if 'codon_site_'+key in df.columns:
9297
df = pandas.merge(df, residue_numberings[key], on='codon_site_'+key, how='left')
9398
df['codon_site_pdb_'+key] = df['codon_site_pdb_'+key].fillna(0).astype(int)
99+
print('The column "codon_site_**ID**" indicates the positions of codons/amino acids in the sequence "**ID**" in the input alignment. 0 = missing site.')
100+
print('The column "codon_site_pdb_**ID**" indicates the positions of codons/amino acids in the sequence "**ID**" in the PDB file. 0 = missing site.')
94101
return df
95102

96-
def add_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map'):
103+
def add_coordinate_from_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map'):
104+
print('Loading amino acid coordinates from: {}'.format(mafft_map_file), flush=True)
97105
with open(mafft_map_file, 'r') as f:
98106
map_str = f.read()
99107
map_list = map_str.split('>')[1:]
@@ -114,6 +122,39 @@ def add_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map'):
114122
df['aa_'+seq_name] = df.loc[:,'aa_'+seq_name].fillna('')
115123
return df
116124

125+
def add_coordinate_from_user_alignment(df, user_alignment):
126+
print('Loading amino acid coordinates from: {}'.format(user_alignment), flush=True)
127+
pdb_fasta = pymol.cmd.get_fastastr(selection='polymer.protein', state=-1, quiet=1)
128+
tmp_pdb_fasta = 'tmp.csubst.pdb_seq.fa'
129+
with open(tmp_pdb_fasta, 'w') as f:
130+
f.write(pdb_fasta)
131+
pdb_seqs = list(SeqIO.parse(open(tmp_pdb_fasta, 'r'), 'fasta'))
132+
user_seqs = list(SeqIO.parse(open(user_alignment, 'r'), 'fasta'))
133+
for user_seq in user_seqs:
134+
for pdb_seq in pdb_seqs:
135+
if user_seq.name!=pdb_seq.name:
136+
continue
137+
user_seq_str = str(user_seq.seq).replace('\n', '')
138+
pdb_seq_str = str(pdb_seq.seq).replace('\n', '')
139+
user_seq_counter = 0
140+
pdb_seq_counter = 0
141+
txt = 'The alignment length should match between --alignment_file ({} sites) and --user_alignment ({} sites)'
142+
assert len(user_seq_str)==df.shape[0], txt.format(df.shape[0], len(user_seq_str))
143+
df['aa_' + user_seq.name] = ''
144+
df['codon_site_' + user_seq.name] = 0
145+
while user_seq_counter <= df.shape[0]-1:
146+
if user_seq_str[user_seq_counter]=='-':
147+
user_seq_counter += 1
148+
continue
149+
if user_seq_str[user_seq_counter]==pdb_seq_str[pdb_seq_counter]:
150+
df.at[user_seq_counter, 'aa_' + user_seq.name] = user_seq_str[user_seq_counter]
151+
df.at[user_seq_counter, 'codon_site_' + user_seq.name] = pdb_seq_counter + 1
152+
user_seq_counter += 1
153+
pdb_seq_counter += 1
154+
else:
155+
pdb_seq_counter += 1
156+
return df
157+
117158
def calc_aa_identity(g):
118159
seqs = sequence.read_fasta(path=g['mafft_add_fasta'])
119160
seqnames = list(seqs.keys())

csubst/sequence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def write_alignment(outfile, mode, g, leaf_only=False):
5959
aln_tmp += translate_state(nlabel, mode, g)
6060
aln_out += aln_tmp+'\n'
6161
with open(outfile, 'w') as f:
62-
print('Writing alignment:', outfile, flush=True)
62+
print('Writing sequence alignment:', outfile, flush=True)
6363
f.write(aln_out)
6464

6565
def get_state_index(state, input_state, ambiguous_table):

0 commit comments

Comments
 (0)