
Commit

Merge pull request #517 from AllenInstitute/analysis
Running bootstraps on hpc
alexpiet authored Nov 16, 2022
2 parents 2299e01 + dce8276 commit 1f97929
Showing 5 changed files with 282 additions and 27 deletions.
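The change set adds a SLURM deployment script (scripts/deploy_running_bootstraps.py plus a small wrapper shell script), a per-bin worker script (scripts/running_bootstrap.py), and caching logic in visual_behavior_glm/PSTH.py so that each running-speed bin's bootstrap is computed and pickled separately and reloaded by compute_running_bootstrap instead of being recomputed.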
98 changes: 98 additions & 0 deletions scripts/deploy_running_bootstraps.py
@@ -0,0 +1,98 @@
import os
import argparse
import time
import pandas as pd

from simple_slurm import Slurm
import visual_behavior_glm.PSTH as psth

parser = argparse.ArgumentParser(description='deploy running bootstrap jobs to the cluster')
parser.add_argument('--env-path', type=str, default='visual_behavior',
    metavar='path', help='path to conda environment to use')


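# A job is considered already fit if its bootstrap output file exists on disk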
def already_fit(row):
    filename = psth.get_hierarchy_filename(
        row.cell_type,
        row.response,
        row['data'],
        'all',
        row.nboots,
        ['visual_strategy_session'],
        'running_{}'.format(row.bin_num))
    return os.path.exists(filename)

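# Build one job per cell type / response pair and running-speed bin (bins -5 through 20)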
def get_bootstrap_jobs():
    nboots = 10000
    base_jobs = [
        {'cell_type':'exc','response':'image','data':'events','nboots':nboots},
        {'cell_type':'sst','response':'image','data':'events','nboots':nboots},
        {'cell_type':'vip','response':'image','data':'events','nboots':nboots},
        {'cell_type':'exc','response':'omission','data':'events','nboots':nboots},
        {'cell_type':'sst','response':'omission','data':'events','nboots':nboots},
        {'cell_type':'vip','response':'omission','data':'events','nboots':nboots}
        ]
    jobs = []
    for b in range(-5,21):
        for j in base_jobs:
            temp = j.copy()
            temp['bin_num'] = b
            jobs.append(temp)
    jobs = pd.DataFrame(jobs)
    return jobs

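# Format the command-line arguments passed to running_bootstrap.py for a single job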
def make_job_string(row):
    arg_string = ''
    arg_string += ' --cell_type {}'.format(row.cell_type)
    arg_string += ' --response {}'.format(row.response)
    arg_string += ' --data {}'.format(row['data'])
    arg_string += ' --bin_num {}'.format(row.bin_num)
    arg_string += ' --nboots {}'.format(row.nboots)
    return arg_string

if __name__ == "__main__":
    args = parser.parse_args()
    python_executable = "{}/bin/python".format(args.env_path)
    print('python executable = {}'.format(python_executable))
    python_file = "/home/alex.piet/codebase/GLM/visual_behavior_glm/scripts/running_bootstrap.py"
    stdout_basedir = "/allen/programs/braintv/workgroups/nc-ophys/visual_behavior/ophys_glm"
    stdout_location = os.path.join(stdout_basedir, 'job_records_running_bootstraps')
    if not os.path.exists(stdout_location):
        print('making folder {}'.format(stdout_location))
        os.mkdir(stdout_location)
    print('stdout files will be at {}'.format(stdout_location))

    job_count = 0
    jobs = get_bootstrap_jobs()

    for index, row in jobs.iterrows():
        if not already_fit(row):
            job_count += 1
            args_string = make_job_string(row)
            print('starting cluster job. job count = {}'.format(job_count))
            print(' ' + args_string)
            job_title = 'bootstraps'
            walltime = '24:00:00'
            mem = '100gb'
            job_id = Slurm.JOB_ARRAY_ID
            job_array_id = Slurm.JOB_ARRAY_MASTER_ID
            output = stdout_location+"/"+str(job_array_id)+"_"+str(job_id)+"_bootstrap.out"

            # instantiate a SLURM object
            slurm = Slurm(
                cpus_per_task=4,
                job_name=job_title,
                time=walltime,
                mem=mem,
                output=output,
                partition="braintv"
            )

            slurm.sbatch('{} {} {}'.format(
                python_executable,
                python_file,
                args_string,
                )
            )
            time.sleep(0.001)


5 changes: 5 additions & 0 deletions scripts/deploy_running_bootstraps.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# Make sure you run `conda activate <env>` first so that this script
# runs from an environment where the AllenSDK is installed
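# For example, with the environment used below:
#   conda activate visbeh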

python deploy_running_bootstraps.py --env-path /home/alex.piet/codebase/miniconda3/envs/visbeh
64 changes: 64 additions & 0 deletions scripts/running_bootstrap.py
@@ -0,0 +1,64 @@
import visual_behavior_glm.PSTH as psth
import psy_output_tools as po
import argparse

parser = argparse.ArgumentParser(description='compute hierarchy bootstraps')
parser.add_argument(
    '--cell_type',
    type=str,
    default='',
    metavar='cell',
    help='cell_type'
)
parser.add_argument(
    '--response',
    type=str,
    default='',
    metavar='response',
    help='response'
)

parser.add_argument(
    '--data',
    type=str,
    default='',
    metavar='',
    help='data'
)

parser.add_argument(
    '--nboots',
    type=int,
    default=0,
    metavar='',
    help='number of bootstrap samples'
)

parser.add_argument(
    '--bin_num',
    type=int,
    default=0,
    metavar='',
    help='running speed bin number'
)


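# Load the ophys summary table and compute the bootstrap for the requested running bin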
if __name__ == '__main__':
    args = parser.parse_args()
    print('Starting bootstrap with the following inputs')
    print('cell_type {}'.format(args.cell_type))
    print('response {}'.format(args.response))
    print('data {}'.format(args.data))
    print('nboots {}'.format(args.nboots))
    print('bin_num {}'.format(args.bin_num))
    print('')
    summary_df = po.get_ophys_summary_table(21)
    psth.load_df_and_compute_running(
        summary_df,
        args.cell_type,
        args.response,
        args.data,
        args.nboots,
        args.bin_num
    )
    print('finished')
138 changes: 111 additions & 27 deletions visual_behavior_glm/PSTH.py
@@ -1,4 +1,5 @@
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
@@ -557,6 +558,63 @@ def plot_strategy_histogram(full_df,cre,condition,experience_level,savefig=False

    return ax

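# Compute the hierarchical bootstrap for a single running-speed bin and pickle the result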
def compute_running_bootstrap_bin(df, condition, cell_type, bin_num, nboots=10000,
    data='events'):

    filename = get_hierarchy_filename(cell_type,condition,data,'all',nboots,
        ['visual_strategy_session'],'running_{}'.format(bin_num))
    if os.path.isfile(filename):
        print('Already computed {}'.format(bin_num))
        return

    # Figure out bins
    if condition =='omission':
        bin_width=5
    elif condition =='image':
        bin_width=5#2
    df['running_bins'] = np.floor(df['running_speed']/bin_width)
    bins = np.sort(df['running_bins'].unique())

    # Check if any data
    if bin_num not in bins:
        print('No data for this running bin')
        return

    # Compute bootstrap for this bin
    temp = df.query('running_bins == @bin_num')[['visual_strategy_session',
        'ophys_experiment_id','cell_specimen_id','response']]
    means = hb.bootstrap(temp, levels=['visual_strategy_session',
        'ophys_experiment_id','cell_specimen_id'],nboots=nboots)
    if (True in means) & (False in means):
        diff = np.array(means[True]) - np.array(means[False])
        pboot = np.sum(diff<0)/len(diff)
        visual = np.std(means[True])
        timing = np.std(means[False])
    else:
        pboot = 1
        if (True in means):
            visual = np.std(means[True])
        else:
            visual = 0
        if (False in means):
            timing = np.std(means[False])
        else:
            timing = 0
    bootstrap = {
        'running_bin':bin_num,
        'p_boot':pboot,
        'visual_sem':visual,
        'timing_sem':timing,
        'n_boots':nboots,
        }

    # Save to file
    filename = get_hierarchy_filename(cell_type,condition,data,'all',nboots,
        ['visual_strategy_session'],'running_{}'.format(bin_num))
    with open(filename,'wb') as handle:
        pickle.dump(bootstrap, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('bin saved to {}'.format(filename))

def compute_running_bootstrap(df,condition,cell_type,nboots=10000,data='events'):
    if condition =='omission':
        bin_width=5
@@ -565,34 +623,45 @@ def compute_running_bootstrap(df,condition,cell_type,nboots=10000,data='events')
     df['running_bins'] = np.floor(df['running_speed']/bin_width)
 
     bootstraps = []
-    bins = df['running_bins'].unique()
+    bins = np.sort(df['running_bins'].unique())
     for b in bins:
-        temp = df.query('running_bins == @b')[['visual_strategy_session',
-            'ophys_experiment_id','cell_specimen_id','response']]
-        means = hb.bootstrap(temp, levels=['visual_strategy_session',
-            'ophys_experiment_id','cell_specimen_id'],nboots=nboots)
-        if (True in means) & (False in means):
-            diff = np.array(means[True]) - np.array(means[False])
-            pboot = np.sum(diff<0)/len(diff)
-            visual = np.std(means[True])
-            timing = np.std(means[False])
+        # First check if this running bin has already been computed
+        filename = get_hierarchy_filename(cell_type,condition,data,'all',nboots,
+            ['visual_strategy_session'],'running_{}'.format(int(b)))
+        if os.path.isfile(filename):
+            # Load this bin
+            with open(filename,'rb') as handle:
+                this_boot = pickle.load(handle)
+            print('loading this boot from file: {}'.format(b))
+            bootstraps.append(this_boot)
         else:
-            pboot = 1
-            if (True in means):
-                visual = np.std(means[True])
-            else:
-                visual = 0
-            if (False in means):
-                timing = np.std(means[False])
-            else:
-                timing = 0
-        bootstraps.append({
-            'running_bin':b,
-            'p_boot':pboot,
-            'visual_sem':visual,
-            'timing_sem':timing,
-            'n_boots':nboots,
-            })
+            print('Need to compute this bin: {}'.format(b))
+            temp = df.query('running_bins == @b')[['visual_strategy_session',
+                'ophys_experiment_id','cell_specimen_id','response']]
+            means = hb.bootstrap(temp, levels=['visual_strategy_session',
+                'ophys_experiment_id','cell_specimen_id'],nboots=nboots)
+            if (True in means) & (False in means):
+                diff = np.array(means[True]) - np.array(means[False])
+                pboot = np.sum(diff<0)/len(diff)
+                visual = np.std(means[True])
+                timing = np.std(means[False])
+            else:
+                pboot = 1
+                if (True in means):
+                    visual = np.std(means[True])
+                else:
+                    visual = 0
+                if (False in means):
+                    timing = np.std(means[False])
+                else:
+                    timing = 0
+            bootstraps.append({
+                'running_bin':b,
+                'p_boot':pboot,
+                'visual_sem':visual,
+                'timing_sem':timing,
+                'n_boots':nboots,
+            })
 
     # Convert to dataframe, do multiple comparisons corrections
     bootstraps = pd.DataFrame(bootstraps)
Expand All @@ -603,21 +672,23 @@ def compute_running_bootstrap(df,condition,cell_type,nboots=10000,data='events')
     bootstraps = bootstraps.drop(columns=['location'])
 
     # Save file
-    filepath = get_hierarchy_filename(cell_type,condition,data,'all',nboots,['visual_strategy_session'],'running')
+    filepath = get_hierarchy_filename(cell_type,condition,data,'all',nboots,
+        ['visual_strategy_session'],'running')
     print('saving bootstraps to: '+filepath)
     bootstraps.to_feather(filepath)
     return bootstraps

 def get_running_bootstraps(cell_type, condition,data,nboots):
-    filepath = get_hierarchy_filename(cell_type,condition,data,'all',nboots,['visual_strategy_session'],'running')
+    filepath = get_hierarchy_filename(cell_type,condition,data,'all',nboots,
+        ['visual_strategy_session'],'running')
     if os.path.isfile(filepath):
         bootstraps = pd.read_feather(filepath)
         return bootstraps
     else:
         print('file not found, compute the running bootstraps first')

-def running_responses(df,condition, cre='vip', bootstraps=None,savefig=False,data='events',
-        split='visual_strategy_session'):
+def running_responses(df, condition, cre='vip', bootstraps=None, savefig=False,
+    data='events', split='visual_strategy_session'):
     if condition =='omission':
         bin_width=5
     elif condition =='image':
@@ -698,6 +769,19 @@ def running_responses(df,condition, cre='vip', bootstraps=None,savefig=False,dat
        print('Figure saved to {}'.format(filename))
        plt.savefig(filename)

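# Map the short cell-type names used by the cluster scripts to Cre lines,
# then compute the bootstrap for a single running-speed bin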
def load_df_and_compute_running(summary_df, cell_type, response, data, nboots, bin_num):
    mapper = {
        'exc':'Slc17a7-IRES2-Cre',
        'sst':'Sst-IRES-Cre',
        'vip':'Vip-IRES-Cre'
        }
    if response == 'image':
        df = load_image_df(summary_df, mapper[cell_type], data)
    elif response == 'omission':
        df = load_omission_df(summary_df, mapper[cell_type], data)
    compute_running_bootstrap_bin(df,response, cell_type, bin_num, nboots=nboots)


def load_df_and_compute_hierarchy(summary_df, cell_type, response, data, depth, nboots,
    splits, query='', extra=''):
    mapper = {
4 changes: 4 additions & 0 deletions visual_behavior_glm/neuro_dev.py
@@ -108,6 +108,10 @@
vip_omission = psth.load_omission_df(summary_df, cre='Vip-IRES-Cre',data='events')
vip_image = psth.load_image_df(summary_df, cre='Vip-IRES-Cre',data='events')

# To make things work on the HPC, you can compute just one running bin
psth.compute_running_bootstrap_bin(vip_omission,'omission','vip',bin_num,data='events',
    nboots=nboots)
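# (bin_num and nboots are assumed to be defined earlier in this scratch file,
#  e.g. bin_num = 5 and nboots = 10000)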

# Generate bootstrapped errorbars:
bootstraps_omission = psth.compute_running_bootstrap(vip_omission,'omission','vip',
    data='events',nboots=nboots)
