supported single branch mode in csubst site

kfuku52 · kfuku52 · commit 93f1b6757dd3 · 2022-06-21T14:32:54.000+02:00
diff --git a/csubst/__init__.py b/csubst/__init__.py
@@ -1 +1 @@
-__version__ = '1.0.8'
+__version__ = '1.1.0'
diff --git a/csubst/csubst b/csubst/csubst
@@ -134,11 +134,11 @@ if __name__ == "__main__":
     psr_fg.add_argument('--mg_sister_stem_only', metavar='yes|no', default='yes', type=strtobool,
                     help='default=%(default)s: Set "yes" to exclude non-stem branches of sister lineages.')
     psr_fg.add_argument('--fg_clade_permutation', metavar='INT', default=0, type=int,
-                    help='default=%(default)s: Randomly select the same/similar number and size of clades as foreground '
+                    help='default=%(default)s: Experimental. Randomly select the same/similar number and size of clades as foreground '
                          'and run analysis N times to obtain a permutation-based P value of convergence. '
                          'At least 1000 is recommended.')
     psr_fg.add_argument('--min_clade_bin_count', metavar='INT', default=10, type=int,
-                    help='default=%(default)s: Minimum number of branches per bin for foreground clade permutation.')
+                    help='default=%(default)s: Experimental. Minimum number of branches per bin for foreground clade permutation. ')
 
     # dataset
     dataset = subparsers.add_parser('dataset', help='see `csubst dataset -h` or https://github.com/kfuku52/csubst/wiki', parents=[])
@@ -204,7 +204,7 @@ if __name__ == "__main__":
                            'Used only with --pdb.')
     site.add_argument('--remove_ligand', metavar='CODE1,CODE2,CODE3,...', default='', type=str,
                       help='default=%(default)s: Comma-delimited list of PDB ligand codes to be removed. '
-                           'e.g., "so4,po4,bme"'
+                           'e.g., "so4,po4,bme". '
                            'Used only with --pdb.')
     site.add_argument('--mask_subunit', metavar='yes|no', default='yes', type=strtobool,
                       help='default=%(default)s: Whether to mask unrelated subunits. '
@@ -292,7 +292,7 @@ if __name__ == "__main__":
                               'This option should be used with --omegaC_method "modelfree".')
     analyze.add_argument('--asrv', metavar='no|pool|sn|each|file', default='each', type=str,
                          choices=['no', 'pool', 'sn', 'each', 'file'],
-                         help='default=%(default)s: Correct among-site rate variation in omega/quantile calculation. '
+                         help='default=%(default)s: Experimental. Correct among-site rate variation in omega/quantile calculation. '
                               '"no", No ASRV, meaning the uniform rate among sites. '
                               '"pool", All categories of substitutions are pooled to calculate a single set of ASRV. '
                               '"sn", Synonymous and nonsynonymous substitutions are processed individually '
diff --git a/csubst/main_analyze.py b/csubst/main_analyze.py
@@ -120,7 +120,7 @@ def main_analyze(g):
     g = parser_misc.generate_intermediate_files(g)
     g = parser_misc.annotate_tree(g)
     g = parser_misc.read_input(g)
-    g,g['state_nuc'],g['state_cdn'],g['state_pep'] = parser_misc.prep_state(g)
+    g = parser_misc.prep_state(g)
 
     sequence.write_alignment('csubst_alignment_codon.fa', mode='codon', g=g)
     sequence.write_alignment('csubst_alignment_aa.fa', mode='aa', g=g)
diff --git a/csubst/main_site.py b/csubst/main_site.py
@@ -138,20 +138,30 @@ def add_substitution_labels(df, SN, sub_type, SN_colors, ax, g):
     return ax
 
 def plot_barchart(df, g):
-    sub_types = {
-        '_sub':'Branch-wise\nsubstitutions\nin the entire tree',
-        '_sub_':'Branch-wise\nsubstitutions\nin the targets',
-        'any2spe':'Posterior prob.\nof any2spe',
-        'any2dif':'Posterior prob.\nof any2dif',
-    }
-    SN_color_all = {
-        '_sub': {'N':'black', 'S':'gainsboro'},
-        '_sub_': {'N':'black', 'S':'gainsboro'},
-        'any2spe': {'N':'red', 'S':'gainsboro'},
-        'any2dif': {'N':'blue', 'S':'gainsboro'},
-    }
+    if g['single_branch_mode']:
+        sub_types = {
+            '_sub':'Branch-wise\nsubstitutions\nin the entire tree',
+            'any2any':'Branch-wise\nsubstitutions\nin the targets', # Identical to branch-wise substitutions in the targets
+        }
+        SN_color_all = {
+            '_sub': {'N':'black', 'S':'gainsboro'},
+            'any2any': {'N':'purple', 'S':'gainsboro'}, # Identical to branch-wise substitutions in the targets
+        }
+    else:
+        sub_types = {
+            '_sub':'Branch-wise\nsubstitutions\nin the entire tree',
+            '_sub_':'Branch-wise\nsubstitutions\nin the targets',
+            'any2spe':'Posterior prob.\nof any2spe',
+            'any2dif':'Posterior prob.\nof any2dif',
+        }
+        SN_color_all = {
+            '_sub': {'N':'black', 'S':'gainsboro'},
+            '_sub_': {'N':'black', 'S':'gainsboro'},
+            'any2spe': {'N':'red', 'S':'gainsboro'},
+            'any2dif': {'N':'blue', 'S':'gainsboro'},
+        }
     num_row = len(sub_types)
-    fig,axes = matplotlib.pyplot.subplots(nrows=num_row, ncols=1, figsize=(7.2, 4.8), sharex=True)
+    fig,axes = matplotlib.pyplot.subplots(nrows=num_row, ncols=1, figsize=(7.2, 1.2*len(sub_types)), sharex=True)
     axes = axes.flat
     i = 0
     NS_ymax = df.loc[:,['N_sub','S_sub']].sum(axis=1).max() + 0.5
@@ -687,27 +697,37 @@ def pdb_sequence_search(g):
             pdb_id = None
     return pdb_id
 
+def combinatorial2single_columns(df):
+    for SN in ['OCS','OCN']:
+        for anc in ['any','spe','dif']:
+            for des in ['any', 'spe', 'dif']:
+                col = SN+anc+'2'+des
+                if col in df.columns:
+                    df = df.drop(labels=col, axis=1)
+    return df
+
 def main_site(g):
     if g['pdb'] is not None:
         from csubst import parser_pymol
-    if g['pdb'] =='besthit':
-        g['run_pdb_sequence_search'] = True
-    else:
-        g['run_pdb_sequence_search'] = False
     print("Reading and parsing input files.", flush=True)
     g['codon_table'] = genetic_code.get_codon_table(ncbi_id=g['genetic_code'])
     g = tree.read_treefile(g)
     g = parser_misc.generate_intermediate_files(g)
     g = parser_misc.annotate_tree(g)
     g = parser_misc.read_input(g)
-    g,g['state_nuc'],g['state_cdn'],g['state_pep'] = parser_misc.prep_state(g)
+    g = parser_misc.prep_state(g)
     N_tensor = substitution.get_substitution_tensor(state_tensor=g['state_pep'], mode='asis', g=g, mmap_attr='N')
     N_tensor = substitution.apply_min_sub_pp(g, N_tensor)
     S_tensor = substitution.get_substitution_tensor(state_tensor=g['state_cdn'], mode='syn', g=g, mmap_attr='S')
     S_tensor = substitution.apply_min_sub_pp(g, S_tensor)
     g = add_branch_id_list(g)
     for branch_ids in g['branch_id_list']:
         print('\nProcessing branch_ids: {}'.format(','.join([ str(bid) for bid in branch_ids ])), flush=True)
+        if len(branch_ids)==1:
+            print('Single branch mode. Substitutions, rather than combinatorial substitutions, will be mapped.')
+            g['single_branch_mode'] = True
+        else:
+            g['single_branch_mode'] = False
         g['branch_ids'] = branch_ids
         g['site_outdir'] = './csubst_site.branch_id'+','.join([ str(bid) for bid in branch_ids ])
         if not os.path.exists(g['site_outdir']):
@@ -749,6 +769,8 @@ def main_site(g):
         else:
             out_file = 'csubst_site.'+re.sub('.pdb$', '', os.path.basename(g['pdb']))+'.tsv'
             out_path = os.path.join(g['site_outdir'], out_file)
+        if g['single_branch_mode']:
+            df = combinatorial2single_columns(df)
         df.to_csv(out_path, sep="\t", index=False, float_format=g['float_format'], chunksize=10000)
     print('To visualize the convergence probability on protein structure, please see: https://github.com/kfuku52/csubst/wiki')
     print('')
diff --git a/csubst/param.py b/csubst/param.py
@@ -37,6 +37,11 @@ def get_global_parameters(args):
         elif (g['float_type']==64):
             g['float_type'] = numpy.float64
             g['float_tol'] = 10**-9
+    if 'pdb' in g.keys():
+        if g['pdb']=='besthit':
+            g['run_pdb_sequence_search'] = True
+        else:
+            g['run_pdb_sequence_search'] = False
     if 'percent_biased_sub' in g.keys():
         assert (g['percent_biased_sub']<100), '--percent_biased_sub should be <100.'
     if 'float_digit' in g.keys():
diff --git a/csubst/parser_misc.py b/csubst/parser_misc.py
@@ -218,7 +218,10 @@ def prep_state(g):
         elif g['input_data_type'] == 'cdn':
             state_cdn = parser_iqtree.get_state_tensor(g)
             state_pep = sequence.cdn2pep_state(state_cdn=state_cdn, g=g)
-    return g,state_nuc,state_cdn,state_pep
+    g['state_nuc'] = state_nuc
+    g['state_cdn'] = state_cdn
+    g['state_pep'] = state_pep
+    return g
 
 def read_exchangeability_matrix(file, codon_orders):
     txt = pkg_resources.resource_string(__name__, file)
diff --git a/csubst/parser_pymol.py b/csubst/parser_pymol.py
@@ -3,7 +3,7 @@
 import pymol
 
 from io import StringIO
-#import itertools
+import copy
 import os
 import re
 import subprocess
@@ -214,13 +214,20 @@ def set_substitution_colors(df, g, object_names, N_sub_cols):
                     color_sites['OCNany2dif'].append(codon_site)
                 elif (prob_single_sub>=g['pymol_min_single_prob']):
                     color_sites['single_sub'].append(codon_site)
+            if g['single_branch_mode']:
+                color_sites['single_branch_N'] = copy.deepcopy(color_sites['OCNany2spe'])
+                del color_sites['OCNany2spe']
+                del color_sites['OCNany2dif']
+                del color_sites['single_sub']
             for key in color_sites.keys():
                 if key=='OCNany2spe':
                     hex_value = utility.rgb_to_hex(r=1, g=0, b=0)
                 elif key=='OCNany2dif':
                     hex_value = utility.rgb_to_hex(r=0, g=0, b=1)
                 elif key=='single_sub':
                     hex_value = utility.rgb_to_hex(r=0.4, g=0.4, b=0.4)
+                elif key=='single_branch_N':
+                    hex_value = utility.rgb_to_hex(r=0.5, g=0, b=0.5)
                 print('Amino acid sites with {} will be painted with {}.'.format(key, hex_value), flush=True)
                 txt_resi = '+'.join([str(site) for site in color_sites[key]])
                 cmd_color = "color {}, {} and chain {} and resi {}"

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = '1.0.8'` |
|  | `1` | `+__version__ = '1.1.0'` |