gnina
diff --git a/Diff for: ‎counterexample_generation_jobs.py
+103 b/Diff for: ‎counterexample_generation_jobs.py
+103
diff --git a/Diff for: ‎generate_counterexample_typeslines.py
+252 b/Diff for: ‎generate_counterexample_typeslines.py
+252
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+'''
+This is a script which will generate a file of commands for gnina to use cnn_minimize to generate iterative training poses.
+
+ASSUMPTIONS
+	  i) assumes all receptors are PDB files IE end in .pdb
+	 ii) Assumes all docked poses or outputs from gnina will be SDF files.
+	iii) The crystal ligand filenames are formatted PDBid_LignameLIGSUFFIX
+	 iv) assumes file format is ROOT/POCKET/FILES
+	  v) Will generate a line for every identified crystal ligand with every identified receptor in POCKET -- i.e. crossdocking.
+	 vi) Assumes ligands will have the name of their corresponding crystal ligand file present in their filename. (This is especially important if using docked poses.)
+	vii) Will generate  REC_LIG_lig_it#_docked.sdf files as output. (If using docked poses as well, their names will have extra _it#_ parts in them; the current it# will be the leftmost one.)
+'''
+
+
+import os, argparse, glob, re
+
+def get_receptors(root,rec_id):
+	'''
+	Return the .pdb paths found by globbing root+'*.pdb' whose filename matches rec_id.
+
+	root is concatenated directly with the glob pattern, so pass it with its trailing separator.
+	rec_id is a regular expression applied with match semantics (anchored at the start of the filename only).
+	'''
+	pattern=re.compile(rec_id)
+	matches=[]
+	for path in glob.glob(root+'*.pdb'):
+		#only the last path component is tested against the pattern
+		filename=path.split('/')[-1]
+		if pattern.match(filename):
+			matches.append(path)
+	return matches
+
+def get_ligands(root,lig_suffix):
+	'''
+	Return every path matched by the glob root+'*'+lig_suffix.
+
+	root is concatenated directly with the pattern, so pass it with its trailing separator.
+	'''
+	pattern=root+'*'+lig_suffix
+	return glob.glob(pattern)
+
+def generate_line(receptor,ligand,outname,crystal_ligand,seed,num_modes,builtin_cnn,supplied_cnn=None,supplied_weights=None):
+	'''
+	Format one newline-terminated gnina minimization command.
+
+	When BOTH supplied_cnn and supplied_weights are truthy the command ends with
+	--cnn_model/--cnn_weights; in every other case it ends with --cnn builtin_cnn.
+	'''
+	#portion of the command shared by both variants
+	command=f'gnina -r {receptor} -l {ligand} -o {outname} --autobox_ligand {crystal_ligand} --seed {seed} --gpu --minimize --cnn_scoring refinement --num_modes {num_modes}'
+	if supplied_cnn and supplied_weights:
+		model_args=f' --cnn_model {supplied_cnn} --cnn_weights {supplied_weights}'
+	else:
+		model_args=f' --cnn {builtin_cnn}'
+	return command+model_args+'\n'
+
+#grabbing the arguments
+parser=argparse.ArgumentParser(description='Create cnn_minimize jobs for a dataset. Assumes dataset file structure is <ROOT>/<Identifier>/<FILES>')
+#each entry is (flag names, add_argument keyword options); options are registered in the order listed
+cli_options=[
+	(('-o','--outfile'),dict(type=str,required=True,help='Name for gnina job commands output file.')),
+	(('-r','--root'),dict(default='./',help='ROOT for data directory structure. Defaults to current working directory.')),
+	(('-ri','--rec_id'),dict(default='...._._rec.pdb',help='Regular expression to identify the receptor PDB. Defaults to ...._._rec.pdb')),
+	(('-cs','--crystal_suffix'),dict(default='_lig.pdb',help='Expresssion to glob the crystal ligand PDB. Defaults to _lig.pdb. Assumes filename is PDBid_LignameLIGSUFFIX')),
+	(('-ds','--docked_suffix'),dict(default='_tt_docked.sdf',help='Expression to glob docked poses. These contain the poses that need to be minimized. Default is "_tt_docked.sdf"')),
+	(('-i','--iteration'),dict(type=int,required=True,help='Sets what iteration number we are doing. Adds _it#_docked.sdf to the output file for the gnina job line.')),
+	(('--num_modes',),dict(type=int,default=20,help='Sets the --num_modes argument for the gnina command. Defaults to 20.')),
+	(('--cnn',),dict(type=str,default='dense',help='Sets the --cnn command for the gnina command. Defaults to dense. Must be dense, general_default2018, or crossdock_default2018.')),
+	(('--cnn_model',),dict(type=str,default=None,help='Override --cnn with a user provided caffe model file. If used, requires the user to pass in a weights file as well.')),
+	(('--cnn_weights',),dict(type=str,default=None,help='The weights file to use with the supplied caffemodel file.')),
+	(('--seed',),dict(default=42,type=int,help='Seed for the gnina commands. Defaults to 42')),
+	(('--dirs',),dict(type=str,default=None,help='Supplied directories to do a subset of the dataset. Default behavior is to do every directory.')),
+]
+for flags,options in cli_options:
+	parser.add_argument(*flags,**options)
+args=parser.parse_args()
+
+#double checking that the arguments are compatible
+if args.cnn_model:
+	#a user supplied model can only be used together with its weights file
+	assert bool(args.cnn_weights),"Didn't set cnn_weights to go with cnn_model"
+else:
+	#without a supplied model the built-in network name must be one of the three known ones
+	assert args.cnn in {'dense','general_default2018','crossdock_default2018'},"Must have built-in cnn be dense, general_default2018, or crossdock_default2018"
+#>=1 so that a single mode (num_modes=1) is accepted, in line with the "positive integer" message
+assert args.num_modes>=1,"Need to set num_modes to a positive integer."
+assert args.seed>0,"Need a positive seed."
+assert args.iteration>0,"Need an iteration number >=1."
+
+
+#now we begin.
+#Step 1 -- assemble all of the directories that we will be using.
+#joining with '' guarantees dataroot ends with a path separator, which the glob pattern below relies on
+dataroot=os.path.join(args.root,'')
+#every immediate subdirectory of the root is a candidate; each returned path ends with '/'
+todo=glob.glob(dataroot+'*/')
+
+if args.dirs:
+	#args.dirs names a text file; each line (trailing whitespace stripped) is a directory name to keep
+	with open(args.dirs) as dirfile:
+		subdirs=set(x.rstrip() for x in dirfile)
+	#because of the trailing '/', the directory name is the second-to-last '/'-separated component
+	todo=[x for x in todo if x.split('/')[-2] in subdirs]
+
+#Step 2 -- main loop of the script
+#set the iteration plugin variable
+itname='_it'+str(args.iteration)
+
+#	 We loop over the pockets
+#TODO -- change to only do the docked poses
+with open(args.outfile,'w') as outfile:
+	for pocket_root in todo:
+		#grab the receptors
+		recs=get_receptors(pocket_root,args.rec_id)
+
+		#grab all of the crystal ligands
+		cr_ligs=get_ligands(pocket_root,args.crystal_suffix)
+
+		#Grab all of the docked poses
+		ligs=get_ligands(pocket_root,args.docked_suffix)
+		for r in recs:
+			#receptor filename without directory and without the .pdb extension
+			rec_tag=r.split('/')[-1].split('.pdb')[0]
+			for cl in cr_ligs:
+				#crystal ligand filename without directory and without the crystal suffix
+				cl_tag=cl.split('/')[-1].split(args.crystal_suffix)[0]
+				for ligname in ligs:
+					#a docked pose is used only when its path contains both the crystal ligand identifier and the receptor identifier
+					if cl_tag not in ligname or rec_tag not in ligname:
+						continue
+					#generate the output filename by inserting the iteration tag in front of the docked suffix
+					outname=ligname.replace(args.docked_suffix,itname+args.docked_suffix)
+
+					outfile.write(generate_line(receptor=r,ligand=ligname,outname=outname,crystal_ligand=cl,seed=args.seed,num_modes=args.num_modes,builtin_cnn=args.cnn,supplied_cnn=args.cnn_model,supplied_weights=args.cnn_weights))
+
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+
+'''
+This script will generate the lines for a new types file with the iterative poses generated from counterexample_generation_jobs.py
+
+!!WARNING!!
+Part of this process is to determine which newly generated poses are NOT REDUNDANT with the previously generated ones.
+This requires an O(n^2) calculation to calculate the RMSD between every pose...
+Ergo, this calculation depending on the number of poses in a given pocket could take a very long time.
+This script also works on all ligands present in the pocket, so there is the potential for multiple O(n^2) calculations to take place.
+
+We have done our best to avoid needless calculations, but this is why we generate the lines for each pocket independently
+
+ASSUMPTIONS:
+	i) Poses with <2 RMSD to the crystal pose will be labeled as positive poses
+	ii) you have obrms installed, and can run it from your commandline
+	iii) the jobfile provided as input contains the full PATH to the files specified.
+	iv) the gninatypes files (generated by gninatyper) for the poses in args.input have ALREADY BEEN generated.
+	v) The crystal ligand files are formatted PDBid_LignameLIGSUFFIX
+	vi) The OLD sdf file with the unique poses is named LignameOLDUNIQUESUFFIX
+
+INPUT:
+	i) The path to the pocket you are working on
+	ii) the threshold RMSD to determine if they are the same pose
+	iii) the name for the txt file that contains the lines to write (will be written in the POCKET DIRECTORY)
+	iv) the suffix of the NEW sdf file that contains all of the unique poses
+	v) the commands file generated from counterexample_generation_jobs.py
+	vi) --OPTIONAL-- the suffix of the OLD sdf file that contains all of the unique poses
+
+OUTPUT:
+	==Normal==
+	i) the typesfile lines to add to generate the new types file
+	ii) A SDF file containing all of the unique poses for a given ligand -- named LignameUNIQUE_SUFFIX
+	iii) a ___.sdf file which will be the working file for obrms.
+'''
+
+import argparse, re, subprocess, os, sys
+import pandas as pd
+from rdkit.Chem import AllChem as Chem
+
+def check_exists(filename):
+	'''Return True only when filename is an existing file whose size is greater than zero bytes.'''
+	return os.path.isfile(filename) and os.path.getsize(filename)>0
+
+def get_pocket_lines(filename,pocket):
+	'''
+	Read filename and return, in file order, only the lines that contain the substring pocket.
+
+	Lines are returned exactly as read (line endings included).
+	'''
+	#the context manager closes the file even if reading fails part way through
+	with open(filename) as infile:
+		return [line for line in infile if pocket in line]
+
+def calc_ligand_dic(lines,ligand_suffix):
+	'''
+	Parse gnina command lines and build two lookups, returned as a tuple:
+		 1) ligand name -> [output (-o) files of the commands using that ligand]
+		 2) output (-o) file -> crystal ligand file given to --autobox_ligand
+
+	The ligand name is the second '_'-separated field of the crystal filename once its
+	directory and ligand_suffix are removed, i.e. PATH/<PDBid>_<ligname><LIGSUFFIX>.
+	'''
+	poses_by_ligand={}
+	crystal_for_pose={}
+	for command in lines:
+		#first whitespace-delimited token following '--autobox_ligand '
+		crystal_path=re.split('--autobox_ligand ',command)[1].split()[0]
+
+		#strip directory and suffix, then take the field after the PDB id
+		crystal_stem=crystal_path.split('/')[-1].split(ligand_suffix)[0]
+		ligand=crystal_stem.split('_')[1]
+
+		#first whitespace-delimited token following '-o '
+		docked_path=re.split('-o ',command)[1].split()[0]
+
+		poses_by_ligand.setdefault(ligand,[]).append(docked_path)
+		crystal_for_pose[docked_path]=crystal_path
+
+	return poses_by_ligand, crystal_for_pose
+
+def run_obrms(ligand_file,crystal_file):
+	'''
+	This function returns a list of rmsds of the docked ligand file to the crystal file. The list is in the order of the poses.
+
+	Runs `obrms ligand_file crystal_file` and takes the last whitespace-separated field of every
+	non-empty output line as a float. Returns an empty list when obrms prints nothing.
+	Raises subprocess.CalledProcessError when obrms exits with a non-zero status.
+	'''
+
+	#argument list instead of a shell string, so paths with spaces or shell metacharacters are passed through untouched
+	output=subprocess.check_output(['obrms',ligand_file,crystal_file])
+	text=str(output,'utf-8')
+	#blank lines are skipped so that empty output gives [] rather than an IndexError
+	return [float(row.split()[-1]) for row in text.splitlines() if row.strip()]
+
+def get_lines_towrite(crystal_lookup,list_of_docked,affinity_lookup,crystal_suffix):
+	'''
+	This function will calculate the RMSD of every input pose, to the provided crystal pose.
+
+	crystal_lookup  -- dictionary: docked pose filename -> crystal ligand filename
+	list_of_docked  -- docked pose filenames to process
+	affinity_lookup -- dictionary keyed by the crystal filename with crystal_suffix removed; the
+	                   full path is tried first, then the bare filename. A miss leaves affinity at 0.0
+	crystal_suffix  -- suffix removed from the crystal filename to build the affinity key
+
+	Each generated line is 'label affinity rmsd receptor_gninatypes ligand_gninatypes'.
+	Poses with RMSD < 2 get label 1; all others get label 0 and a '-' in front of the affinity.
+
+	returns a dictionary of lines -->  'docked pose filename':[lines to write]
+	'''
+	lines={}
+
+	for docked in list_of_docked:
+		crystal=crystal_lookup[docked]
+
+		#Figure out affinity.
+		affinity=0.0
+		stem=crystal.split(crystal_suffix)[0]
+		for candidate in (stem,stem.split('/')[-1]):
+			if candidate in affinity_lookup:
+				#store the value for this ligand, not the lookup table itself
+				affinity=affinity_lookup[candidate]
+				break
+
+		print(docked,crystal)
+		rmsds=run_obrms(docked,crystal)
+
+		#the receptor gninatypes name depends only on the docked filename, so build it once per file
+		rec_gninatypes=docked.split('rec')[0]+'rec_0.gninatypes'
+		lines[docked]=[]
+		for counter,r in enumerate(rmsds):
+			if r < 2:
+				label='1'
+				neg_aff=''
+			else:
+				label='0'
+				neg_aff='-'
+
+			#pose number counter of the docked file maps to <docked minus .sdf>_<counter>.gninatypes
+			lig_gninatypes=docked.replace('.sdf','_'+str(counter)+'.gninatypes')
+			lines[docked].append(f'{label} {neg_aff}{affinity} {r} {rec_gninatypes} {lig_gninatypes}\n')
+	return lines
+
+def run_obrms_cross(filename):
+	'''
+	This function returns a pandas dataframe of the RMSD between every pose and every other pose, which is generated using obrms -x
+
+	The output is parsed as comma-separated rows; the first field of every row is dropped
+	(presumably the pose label -- confirm against obrms output) and the rest are stored as floats.
+	Raises subprocess.CalledProcessError when obrms exits with a non-zero status.
+	'''
+
+	#argument list instead of a shell string, so a filename with spaces or shell metacharacters is passed through untouched
+	output=subprocess.check_output(['obrms','-x',filename])
+	rows=str(output,'utf-8').rstrip().split('\n')
+	data=pd.DataFrame([row.split(',')[1:] for row in rows],dtype=float)
+	return data
+
+
+parser=argparse.ArgumentParser(description='Create lines to add to types files from counterexample generation. Assumes data file structure is ROOT/POCKET/FILES.')
+parser.add_argument('-p','--pocket',type=str,required=True,help='Name of the pocket that you will be generating the lines for.')
+parser.add_argument('-r','--root',type=str,required=True,help='PATH to the ROOT of the pockets.')
+parser.add_argument('-i','--input',type=str,required=True,help='File that is output from counterexample_generation_jobs.py')
+parser.add_argument('-cs','--crystal_suffix',default='_lig.pdb',help='Expresssion to glob the crystal ligand PDB. Defaults to _lig.pdb. Needs to match what was used with counterexample_generation_jobs.py')
+parser.add_argument('--old_unique_suffix',type=str,default=None,help='Suffix for the unique ligand sdf file from a previous run. If set we will load that in and add to it. Default behavior is to generate it from provided input file.')
+parser.add_argument('-us','--unique_suffix',type=str,default='_it1___.sdf',help='Suffix for the unique ligand sdf file for this run. Defaults to _it1___.sdf. One will be created for each ligand in the pocket.')
+#the three thresholds are compared numerically later on, so command line values must be parsed as floats (without type= they would arrive as strings)
+parser.add_argument('--unique_threshold',type=float,default=0.25,help='RMSD threshold for unique poses. IE poses with RMSD > thresh are considered unique. Defaults to 0.25.')
+parser.add_argument('--lower_confusing_threshold',type=float,default=0.5,help='CNNscore threshold for identifying confusing good poses. Score < thresh & under 2RMSD is kept and labelled 1. 0<thresh<1. Default 0.5')
+parser.add_argument('--upper_confusing_threshold',type=float,default=0.9,help='CNNscore threshold for identifying confusing poor poses. If CNNscore > thresh & over 2RMSD pose is kept and labelled 0. lower<thresh<1. Default 0.9')
+parser.add_argument('-o','--outname',type=str,required=True,help='Name of the text file to write the new lines in. DO NOT WRITE THE FULL PATH!')
+parser.add_argument('-a','--affinity_lookup',default='pdbbind2017_affs.txt',help='File mapping the PDBid and ligname of the ligand to its pK value. Assmes space delimited "PDBid ligname pK". Defaults to pdbbind2017_affs.txt')
+args=parser.parse_args()
+
+#Setting the myroot and root remove variable for use in the script
+#myroot is ROOT/POCKET/ and root_remove is ROOT/ -- joining with '' makes both end with a path separator
+myroot=os.path.join(args.root,args.pocket,'')
+root_remove=os.path.join(args.root,'')
+
+
+#sanity check threshold
+assert args.unique_threshold > 0, "Unique RMSD threshold needs to be positive"
+assert 0<args.lower_confusing_threshold <1, "Lower_confusing_threshold needs to be in (0,1)"
+assert args.lower_confusing_threshold<args.upper_confusing_threshold<1, "Upper_confusing_threshold needs to be in (lower_confusing_threshold,1)"
+
+#generating our affinity lookup dictionary
+#every usable line is whitespace delimited "PDBid ligname pK"; the key is PDBid_ligname and the value stays a string
+affinity_lookup={}
+with open(args.affinity_lookup) as infile:
+	for line in infile:
+		items=line.split()
+		#blank or incomplete lines carry no affinity; skip them instead of failing with an IndexError
+		if len(items)<3:
+			continue
+		affinity_lookup[items[0]+'_'+items[1]]=items[2]
+
+#first we will generate the dictionary for the ligand - poses we will use.
+tocheck=get_pocket_lines(args.input, args.pocket)
+datadic, docked_to_crystal_lookup=calc_ligand_dic(tocheck,args.crystal_suffix)
+
+#main loop of the script
+with open(myroot+args.outname,'w') as outfile:
+	#loop over the ligands (every ligand found in the job file for this pocket is processed)
+	for cr_name, list_o_ligs in datadic.items():
+		#0)  Make sure that the working sdf is free.
+		sdf_name=myroot+'___.sdf'
+		sdf_tmp=myroot+'___tmp.sdf'
+		#if this "___sdf" file already exists, we need to delete it and make a new one.
+		if check_exists(sdf_name):
+			os.remove(sdf_name)
+
+		#1) Figure out ALL of the lines to write
+		line_dic=get_lines_towrite(crystal_lookup=docked_to_crystal_lookup,list_of_docked=list_o_ligs,affinity_lookup=affinity_lookup,crystal_suffix=args.crystal_suffix)
+
+		#2) Set up the 'working sdf' for the obrms -x calculations, consisting of the confusing examples + any possible previously generated examples
+		# i) iterate over the possible lines for this ligand, keep only the confusing ones,
+		#         and write the confusing poses into the working sdf file.
+
+		w=Chem.SDWriter(sdf_name)
+		keys=list(line_dic.keys())
+		for key in keys:
+			kept_lines=[]
+			supply=Chem.SDMolSupplier(key,sanitize=False)
+			for i,mol in enumerate(supply):
+				#the supplier yields None for a record it cannot parse; such a pose is neither written nor kept,
+				#so the kept lines and the working sdf stay aligned
+				if mol is None:
+					continue
+				curr_line=line_dic[key][i]
+				score=float(mol.GetProp('CNNscore'))
+				label=curr_line.split()[0]
+				#confusing = scored "well" but was a bad pose, or scored "poor" but was a good pose
+				confusing_bad=score > args.upper_confusing_threshold and label=='0'
+				confusing_good=score < args.lower_confusing_threshold and label=='1'
+				if confusing_bad or confusing_good:
+					kept_lines.append(curr_line)
+					w.write(mol)
+			#after the lines have been checked, we overwrite and only store the lines we kept.
+			line_dic[key]=kept_lines
+		#close explicitly so the working sdf is completely on disk before it is read or moved below
+		w.close()
+
+		# ii) Prepend ___.sdf with the previously existing unique poses sdf
+		offset=0
+
+		if args.old_unique_suffix:
+			print('Prepending existing similarity sdf to working sdf file')
+			old_sdfname=myroot+cr_name+args.old_unique_suffix
+			supply=Chem.SDMolSupplier(old_sdfname,sanitize=False)
+			#the new poses start at this index once the old poses are placed in front of them
+			offset=len(supply)
+			#move the new poses aside, then rebuild the working sdf as old poses followed by new poses
+			os.replace(sdf_name,sdf_tmp)
+			with open(sdf_name,'wb') as merged:
+				for part in (old_sdfname,sdf_tmp):
+					with open(part,'rb') as src:
+						merged.write(src.read())
+
+		#3) run obrms -x working_sdf to calculate the rmsd between each pose. This is the O(n^2) calculation
+		unique_data=run_obrms_cross(sdf_name)
+
+		#4) determine the newly found "unique" poses
+		#   assignments maps a pose index to the first pose index it was found similar to
+		assignments={}
+		for (r,row) in unique_data.iterrows():
+			if r not in assignments:
+				for simi in row[row<args.unique_threshold].index:
+					if simi not in assignments:
+						assignments[simi]=r
+
+		#a pose assigned to some other pose is redundant
+		to_remove={k for (k,v) in assignments.items() if k!=v}
+		#5) write the remaining lines for the newly found "unique" poses.
+		counter=offset
+		for key in keys:
+			for line in line_dic[key]:
+				if counter not in to_remove:
+					outfile.write(line.replace(root_remove,''))
+				counter+=1
+
+		#6) Write out the new "uniques" sdf file to allow for easier future generation
+		new_unique_sdfname=myroot+cr_name+args.unique_suffix
+		w=Chem.SDWriter(new_unique_sdfname)
+		supply=Chem.SDMolSupplier(sdf_name,sanitize=False)
+		for i,mol in enumerate(supply):
+			if i not in to_remove and mol is not None:
+				w.write(mol)
+		#flush and close the uniques sdf before moving on to the next ligand
+		w.close()