Supported --user_alignment #28

kfuku52 · kfuku52 · commit bc721a11a31d · 2023-02-07T16:01:15.000+01:00
diff --git a/csubst/__init__.py b/csubst/__init__.py
@@ -1 +1 @@
-__version__ = '1.2.4'
+__version__ = '1.3.0'
diff --git a/csubst/csubst b/csubst/csubst
@@ -57,9 +57,9 @@ if __name__ == "__main__":
     # shared: common
     psr_co = argparse.ArgumentParser(add_help=False)
     psr_co.add_argument('--alignment_file', metavar='PATH', default='', type=str,
-                       help='default=%(default)s: PATH to input in-frame codon alignment.')
+                       help='default=%(default)s: PATH to in-frame codon alignment (FASTA format).')
     psr_co.add_argument('--rooted_tree_file', metavar='PATH', default='', type=str,
-                       help='default=%(default)s: PATH to input rooted tree. Tip labels should be consistent with --alignment_file.')
+                       help='default=%(default)s: PATH to input rooted tree (Newick format). Tip labels should be consistent with --alignment_file.')
     psr_co.add_argument('--genetic_code', metavar='INTEGER', type=int, required=False, default=1,
                        help='default=%(default)s: NCBI codon table ID. 1 = "Standard". See here: '
                             'https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi')
@@ -208,6 +208,8 @@ if __name__ == "__main__":
                       help='default=%(default)s: E-value cutoff in the database searches. Applied to MMseqs2 and QBLAST.')
     site.add_argument('--database_minimum_identity', metavar='FLOAT', default=0.5, required=False, type=float,
                       help='default=%(default)s: The minimum sequence identity for the database searches. Applied to MMseqs2. See https://search.rcsb.org/index.html#search-api')
+    site.add_argument('--user_alignment', metavar='PATH', default=None, required=False, type=str,
+                      help='default=%(default)s: The user-provided alignment FASTA for the substitution mapping to protein structures.')
     site.add_argument('--remove_solvent', metavar='yes|no', default='yes', type=strtobool,
                       help='default=%(default)s: Whether to remove solvent and small non-specific ligands. '
                            'Used only with --pdb.')
diff --git a/csubst/main_site.py b/csubst/main_site.py
@@ -681,9 +681,13 @@ def main_site(g):
             id_base = re.sub('.cif$', '', id_base)
             g['pdb_outfile_base'] = os.path.join(g['site_outdir'], 'csubst_site.' + id_base)
             parser_pymol.initialize_pymol(g=g)
-            g['mafft_add_fasta'] = g['pdb_outfile_base']+'.fa'
-            parser_pymol.write_mafft_map(g=g)
-            df = parser_pymol.add_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map')
+            if g['user_alignment'] is not None:
+                g['mafft_add_fasta'] = g['user_alignment']
+                df = parser_pymol.add_coordinate_from_user_alignment(df=df, user_alignment=g['mafft_add_fasta'])
+            else:
+                g['mafft_add_fasta'] = g['pdb_outfile_base']+'.fa'
+                parser_pymol.write_mafft_alignment(g=g)
+                df = parser_pymol.add_coordinate_from_mafft_map(df=df, mafft_map_file='tmp.csubst.pdb_seq.fa.map')
             df = parser_pymol.add_pdb_residue_numbering(df=df)
             g['session_file_path'] = g['pdb_outfile_base']+'.pymol.pse'
             parser_pymol.write_pymol_session(df=df, g=g)
diff --git a/csubst/parser_pymol.py b/csubst/parser_pymol.py
@@ -2,6 +2,8 @@
 import pandas
 import pymol
 
+from Bio import SeqIO
+
 from io import StringIO
 import copy
 import os
@@ -26,7 +28,7 @@ def initialize_pymol(g):
         print('Loading PDB file: {}'.format(g['pdb']), flush=True)
         pymol.cmd.load(g['pdb'])
 
-def write_mafft_map(g):
+def write_mafft_alignment(g):
     tmp_pdb_fasta = 'tmp.csubst.pdb_seq.fa'
     mafft_map_file = tmp_pdb_fasta+'.map'
     if os.path.exists(mafft_map_file):
@@ -45,16 +47,19 @@ def write_mafft_map(g):
     out_mafft = subprocess.run(cmd_mafft, stdout=subprocess.PIPE)
     with open(g['mafft_add_fasta'], 'w') as f:
         f.write(out_mafft.stdout.decode('utf8'))
+    print('')
     for i in range(10):
         if os.path.exists(mafft_map_file):
-            print('mafft map file was generated.', flush=True)
+            print('MAFFT alignment file was generated: {}'.format(g['mafft_add_fasta']), flush=True)
             break
         else:
-            print('mafft map file not detected. Waiting {:} sec'.format(i+1), flush=True)
+            print('MAFFT alignment file not detected. Waiting {:} sec'.format(i+1), flush=True)
             time.sleep(1)
-    txt = 'CSUBST does not exclude poorly aligned regions. ' \
-          'Please carefully check {} before biological interpretation of substitution events.'
-    print(txt.format(g['mafft_add_fasta']), flush=True)
+    print('CSUBST does not exclude poorly aligned regions.', flush=True)
+    print('Please carefully check the MAFFT alignment file before biological interpretation of substitution events.', flush=True)
+    print('If manual adjustment is necessary, please correct the amino acid positions of database-derived sequences and use the updated MAFFT alignment file as input with --user_alignment.', flush=True)
+    print('The CSUBST input sequences (i.e., sequences in the file specified by --alignment_file) should not be modified at this stage.', flush=True)
+    print('', flush=True)
     if os.path.getsize(g['mafft_add_fasta'])==0:
         sys.stderr.write('File size of {} is 0. A wrong ID might be specified in --pdb.\n'.format(g['mafft_add_fasta']))
         sys.stderr.write('Exiting.\n')
@@ -91,9 +96,12 @@ def add_pdb_residue_numbering(df):
             if 'codon_site_'+key in df.columns:
                 df = pandas.merge(df, residue_numberings[key], on='codon_site_'+key, how='left')
                 df['codon_site_pdb_'+key] = df['codon_site_pdb_'+key].fillna(0).astype(int)
+    print('The column "codon_site_**ID**" indicates the positions of codons/amino acids in the sequence "**ID**" in the input alignment. 0 = missing site.')
+    print('The column "codon_site_pdb_**ID**" indicates the positions of codons/amino acids in the sequence "**ID**" in the PDB file. 0 = missing site.')
     return df
 
-def add_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map'):
+def add_coordinate_from_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map'):
+    print('Loading amino acid coordinates from: {}'.format(mafft_map_file), flush=True)
     with open(mafft_map_file, 'r') as f:
         map_str = f.read()
     map_list = map_str.split('>')[1:]
@@ -114,6 +122,73 @@ def add_mafft_map(df, mafft_map_file='tmp.csubst.pdb_seq.fa.map'):
             df['aa_'+seq_name] = df.loc[:,'aa_'+seq_name].fillna('')
     return df
 
+def add_coordinate_from_user_alignment(df, user_alignment):
+    """Map PDB residue positions onto alignment sites via a user-provided alignment.
+
+    The protein sequences of the structures loaded in PyMOL are exported as FASTA
+    and matched by record name against the records in ``user_alignment``. For each
+    matching record two columns are added to ``df``: ``aa_<name>`` holds the aligned
+    residue ('' where nothing is mapped) and ``codon_site_<name>`` holds the 1-based
+    position of that residue in the PyMOL-derived sequence (0 where nothing is mapped).
+
+    Args:
+        df: Site table; must have exactly one row per alignment column.
+        user_alignment: PATH to the aligned FASTA passed with --user_alignment.
+
+    Returns:
+        ``df`` with the columns described above added.
+
+    Raises:
+        ValueError: If a matched sequence differs in length from ``df``, or if it
+            contains a residue that cannot be found in the remaining part of the
+            PyMOL-derived sequence.
+    """
+    print('Loading amino acid coordinates from: {}'.format(user_alignment), flush=True)
+    pdb_fasta = pymol.cmd.get_fastastr(selection='polymer.protein', state=-1, quiet=1)
+    tmp_pdb_fasta = 'tmp.csubst.pdb_seq.fa'
+    with open(tmp_pdb_fasta, 'w') as f:
+        f.write(pdb_fasta)
+    # Parse inside context managers so that both file handles are closed.
+    with open(tmp_pdb_fasta, 'r') as f:
+        pdb_seqs = {seq.name: str(seq.seq) for seq in SeqIO.parse(f, 'fasta')}
+    with open(user_alignment, 'r') as f:
+        user_seqs = list(SeqIO.parse(f, 'fasta'))
+    num_site = df.shape[0]
+    num_mapped = 0
+    for user_seq in user_seqs:
+        if user_seq.name not in pdb_seqs:
+            continue
+        user_seq_str = str(user_seq.seq)
+        pdb_seq_str = pdb_seqs[user_seq.name]
+        if len(user_seq_str) != num_site:
+            # Explicit raise: an assert would be stripped under "python -O".
+            txt = 'The alignment length should match between --alignment_file ({} sites) and --user_alignment ({} sites)'
+            raise ValueError(txt.format(num_site, len(user_seq_str)))
+        aa = [''] * num_site
+        codon_site = [0] * num_site
+        pdb_pos = 0
+        for i, residue in enumerate(user_seq_str):
+            if residue == '-':
+                continue
+            # PDB residues missing from the user alignment are skipped by searching
+            # forward. Without the -1 check, a residue absent from the rest of the
+            # PDB sequence would run past its end and fail with a bare IndexError.
+            hit = pdb_seq_str.find(residue, pdb_pos)
+            if hit == -1:
+                txt = 'Residue "{}" at alignment site {} of "{}" in --user_alignment was not found in the PDB-derived sequence after position {}.'
+                raise ValueError(txt.format(residue, i + 1, user_seq.name, pdb_pos))
+            aa[i] = residue
+            codon_site[i] = hit + 1
+            pdb_pos = hit + 1
+        # Whole-column assignment is positional, so the mapping does not depend on
+        # df carrying a default 0..n-1 index.
+        df['aa_' + user_seq.name] = aa
+        df['codon_site_' + user_seq.name] = codon_site
+        num_mapped += 1
+    if num_mapped == 0:
+        print('No sequence name in --user_alignment matched the PDB-derived sequences. No amino acid coordinates were added.', flush=True)
+    return df
+
 def calc_aa_identity(g):
     seqs = sequence.read_fasta(path=g['mafft_add_fasta'])
     seqnames = list(seqs.keys())
diff --git a/csubst/sequence.py b/csubst/sequence.py
@@ -59,7 +59,7 @@ def write_alignment(outfile, mode, g, leaf_only=False):
         aln_tmp += translate_state(nlabel, mode, g)
         aln_out += aln_tmp+'\n'
     with open(outfile, 'w') as f:
-        print('Writing alignment:', outfile, flush=True)
+        print('Writing sequence alignment:', outfile, flush=True)
         f.write(aln_out)
 
 def get_state_index(state, input_state, ambiguous_table):

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '1.2.4'`
	`1`	`+__version__ = '1.3.0'`