Skip to content

Commit be3eb10

Browse files
committed
new major revision of quickMD-nf workflow
1 parent 022d936 commit be3eb10

38 files changed

+224929
-21
lines changed

.gitignore

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,17 @@ logs/
44
.DS_Store
55
*.pyc
66
.vscode
7-
.nextflow*
7+
.nextflow*
8+
results_2024-11-22/
9+
results-4.0_2024-11-07/
10+
results-4.0_2024-11-08_75K/
11+
results-4.0_2024-11-08_old/
12+
results-4.0_2024-11-09/
13+
results-4.0_2024-11-13/
14+
results-4.0_2024-11-14/
15+
results-4.0_2024-11-15/
16+
results-4.0pH_2024-08-23/
17+
results-4.0pH_2024-10-28/
18+
results-7.4pH_2024-09-02/
19+
results-7.4pH_2024-10-25/
20+
results-old/

bin/esmfold_pdbgen.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python3
2+
"""esmfold_pdbgen
3+
4+
Converting FASTA file sequences to PDB files via use of ESMFold
5+
Usage:
6+
esmfold_pdbgen.py [--i=<fasta>]
7+
8+
Options:
9+
--i=<fasta> Input fasta file containing protein sequence
10+
"""
11+
import logging
12+
from docopt import docopt
13+
import torch
14+
from esm import pretrained, FastaBatchedDataset
15+
16+
# Sequence folded when no --i option is supplied (keeps the original
# stand-alone demo behaviour of the script).
DEFAULT_SEQUENCE = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRIA"
DEFAULT_OUTPUT = "output.pdb"


def read_fasta(path):
    """Parse a FASTA file into a list of ``(label, sequence)`` tuples.

    The label is the first whitespace-delimited token of the ``>`` header
    line (or ``protein<N>`` when the header is empty). Sequence lines that
    belong to one record are concatenated; blank lines are ignored, as are
    any sequence lines that appear before the first header.
    """
    records = []
    label = None
    chunks = []
    with open(path, "r") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if label is not None:
                    records.append((label, "".join(chunks)))
                header = line[1:].split()
                label = header[0] if header else f"protein{len(records) + 1}"
                chunks = []
            elif label is not None:
                chunks.append(line)
    if label is not None:
        records.append((label, "".join(chunks)))
    return records


def main():
    """Fold the requested sequences with ESMFold and write one PDB per sequence.

    With ``--i=<fasta>`` every non-empty record of the FASTA file is folded
    and written to ``<label>.pdb`` in the working directory. Without
    ``--i`` the built-in example sequence is folded and written to
    ``output.pdb``.
    """
    arguments = docopt(__doc__, version='esmfold_pdbgen.py')
    fasta = arguments['--i']

    if fasta:
        jobs = []
        for label, sequence in read_fasta(fasta):
            if not sequence:
                logging.warning("Skipping FASTA record %s: empty sequence", label)
                continue
            # '/' in a FASTA label would otherwise be read as a directory.
            jobs.append((label.replace("/", "_") + ".pdb", sequence))
        if not jobs:
            raise SystemExit(f"esmfold_pdbgen.py: no sequences found in {fasta}")
    else:
        jobs = [(DEFAULT_OUTPUT, DEFAULT_SEQUENCE)]

    # esmfold_v0() returns the model only (there is no alphabet to unpack).
    model = pretrained.esmfold_v0()
    model = model.eval().cuda() if torch.cuda.is_available() else model.eval()

    for outname, sequence in jobs:
        # infer_pdb() takes the raw amino-acid string and returns PDB text.
        with torch.no_grad():
            pdb_text = model.infer_pdb(sequence)
        with open(outname, "w") as pdbfile:
            pdbfile.write(pdb_text)
        print(f"PDB structure saved to {outname}")


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()

bin/interchain_pairs.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/env python3
2+
"""interchain_pairs
3+
4+
Find closest atom pairs at interchain interface to evidence protein secondary structure
5+
Usage:
6+
interchain_pairs.py [--inpdb=<pdb>]
7+
8+
Options:
9+
--inpdb=<pdb> Input PDB file of protein as obtained from previous process
10+
"""
11+
import logging
12+
from docopt import docopt
13+
import MDAnalysis as mda
14+
from MDAnalysis.analysis import align
15+
from MDAnalysis.analysis import rms
16+
from MDAnalysis import transformations
17+
import pandas as pd
18+
from biopandas.pdb import PandasPdb
19+
from typing import Optional, Tuple, List
20+
import numpy as np
21+
22+
def average_trajectory(pdb: str, pdbout: str,
                       box_dimensions=(117.0, 117.0, 117.0, 90.0, 90.0, 90.0)):
    """Write the frame-averaged protein coordinates of ``pdb`` to ``pdbout``.

    Args:
        pdb: Path of the input structure/trajectory file loaded with
            MDAnalysis.
        pdbout: Path of the PDB file that receives the averaged protein.
        box_dimensions: Unit cell ``[a, b, c, alpha, beta, gamma]`` assigned
            to every frame before unwrapping and centering. Defaults to the
            117 x 117 x 117 orthogonal box previously hard-coded here.

    Raises:
        ValueError: If the input contains no frames.
    """
    u = mda.Universe(pdb)
    ag = u.atoms
    protein = u.select_atoms("protein")

    # On-the-fly transformations applied to every frame as it is read:
    # assign the box, unwrap all atoms, then center the protein in the box.
    set_dim = transformations.boxdimensions.set_dimensions(list(box_dimensions))
    unwrap = transformations.unwrap(ag)
    center = transformations.center_in_box(protein, wrap=True)
    u.trajectory.add_transformations(set_dim, unwrap, center)

    n_frames = len(u.trajectory)
    if n_frames == 0:
        raise ValueError(f"No frames found in {pdb}")

    # Protein-only universe that will carry the averaged coordinates.
    avg_universe = mda.Merge(protein)
    avg_universe.add_TopologyAttr('tempfactors')

    # Accumulate in float64: positions are delivered as float32 and summing
    # many frames at that precision loses accuracy.
    coord_sum = np.zeros(protein.positions.shape, dtype=np.float64)
    for _ in u.trajectory:
        coord_sum += protein.positions

    logging.info("Averaged %d frame(s) from %s", n_frames, pdb)

    avg_universe.atoms.positions = coord_sum / n_frames
    avg_universe.atoms.write(pdbout)
49+
50+
51+
def get_contact_atoms(pdbout: str, threshold: float,
                      chain1: str = "A", chain2: str = "B",
                      atom_name: str = "CA"):
    """Find inter-chain atom pairs that lie closer than ``threshold``.

    Args:
        pdbout: Path of the PDB file to analyse.
        threshold: Distance cut-off, in the same units as the PDB
            coordinates; pairs strictly below it are reported.
        chain1: Chain ID of the first chain (default ``"A"``).
        chain2: Chain ID of the second chain (default ``"B"``).
        atom_name: Atom name used to represent each residue
            (default ``"CA"``).

    Returns:
        A tuple ``(node_pairs, result)``. ``node_pairs`` is an array with
        one row per pair: ``[id_chain1, id_chain2, distance]`` where each
        id is ``"<chain_id>:<residue_name>:<residue_number>"``. ``result``
        is a DataFrame of the distinct ATOM records, from both chains, that
        take part in at least one pair.
    """
    # Read ATOM records only (HETATM records are not considered).
    pdb_df = PandasPdb().read_pdb(pdbout).df['ATOM']
    pdb_df = pdb_df.dropna(subset=['residue_number'])

    coord_names = ['x_coord', 'y_coord', 'z_coord']

    # One representative atom per residue, split by chain.
    is_selected_atom = pdb_df['atom_name'] == atom_name
    df1 = pdb_df[(pdb_df['chain_id'] == chain1) & is_selected_atom]
    df2 = pdb_df[(pdb_df['chain_id'] == chain2) & is_selected_atom]

    coords1 = df1[coord_names].to_numpy()
    coords2 = df2[coord_names].to_numpy()

    # All-against-all Euclidean distances, shape (len(df1), len(df2)).
    dist_matrix = np.sqrt(((coords1[:, None] - coords2) ** 2).sum(axis=2))

    # Row/column indices of every pair below the cut-off.
    pairs = np.argwhere(dist_matrix < threshold)
    logging.info("Found %d inter-chain pair(s) below %s", len(pairs), threshold)
    logging.debug("Pair indices: %s", pairs)

    # Identify chain and residue of each atom in the selected pairs.
    atoms1, atoms2 = df1.iloc[pairs[:, 0]], df2.iloc[pairs[:, 1]]
    distances = dist_matrix[pairs[:, 0], pairs[:, 1]]
    logging.debug("Pair distances: %s", distances)

    atoms1_id = atoms1['chain_id'].map(str) + ":" + atoms1['residue_name'].map(str) + ":" + atoms1['residue_number'].map(str)
    atoms2_id = atoms2['chain_id'].map(str) + ":" + atoms2['residue_name'].map(str) + ":" + atoms2['residue_number'].map(str)
    node_pairs = np.vstack((atoms1_id.values, atoms2_id.values, distances)).T

    # Distinct atoms involved in any contact, chain1 rows first.
    result = pd.concat([df1.iloc[np.unique(pairs[:, 0])], df2.iloc[np.unique(pairs[:, 1])]])
    return node_pairs, result
93+
94+
def main():
    """Average the input structure and export its inter-chain contact pairs.

    Reads ``--inpdb``, writes ``<stem>_average.pdb`` with the averaged
    protein coordinates and ``<stem>_interchain_pairs.csv`` with the atom
    pairs found within a 15.0 cut-off, where ``<stem>`` is the input path
    without its trailing ``.pdb``.
    """
    arguments = docopt(__doc__, version='interchain_pairs.py')
    pdb = arguments['--inpdb']
    if not pdb:
        # The usage string marks the option as optional, but nothing can be
        # done without it; fail with a clear message instead of on None.
        raise SystemExit("interchain_pairs.py: --inpdb=<pdb> is required")

    # Strip only a trailing ".pdb"; str.replace would also rewrite any
    # ".pdb" occurring earlier in the path (e.g. in a directory name).
    suffix = ".pdb"
    pdbstem = pdb[:-len(suffix)] if pdb.endswith(suffix) else pdb
    pdbout = pdbstem + "_average.pdb"
    csvout = pdbstem + "_interchain_pairs.csv"

    average_trajectory(pdb, pdbout)

    threshold = 15.0
    node_pairs, _ = get_contact_atoms(pdbout, threshold)
    node_pairs_df = pd.DataFrame(node_pairs, columns=['Atom1', 'Atom2', 'Distance'])
    node_pairs_df.to_csv(csvout)


if __name__ == '__main__':
    # main() parses the command line itself, so it is not parsed again here.
    logging.getLogger().setLevel(logging.INFO)
    main()

bin/mutant_creator.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/usr/bin/env python3
2+
"""mutant_creator
3+
4+
Create variants to the wildtype PDB based on a CSV file containing a list of named variants with corresponding list of mutations for each one
5+
Usage:
6+
mutant_creator.py [--wtin=<wtin>] [--varlist=<varlist>] [--pH=<pH>]
7+
8+
Options:
9+
--wtin=<wtin> Wildype PDB file to be mutated
10+
--varlist=<varlist> Input CSV file containing variants with mutation lists
11+
--pH=<pH> Set pH to desired
12+
"""
13+
import logging
14+
from docopt import docopt
15+
from pdbfixer import PDBFixer
16+
from openmm.app import PDBFile
17+
import csv
18+
from Bio.SeqUtils import seq3
19+
from Bio import pairwise2
20+
from Bio.pairwise2 import format_alignment
21+
22+
23+
#function find_mutations for use when sequence data available only
24+
def find_mutations(ref_seq, mut_seq):
    """List point substitutions between a reference and a mutant sequence.

    The two sequences are globally aligned and every aligned column whose
    residues differ is reported as ``"<ref><position><mut>"`` (for example
    ``"E48D"``), where ``position`` is the 1-based residue number in
    ``ref_seq``. Columns that contain a gap (insertions or deletions) are
    not substitutions; they are logged and skipped.

    NOTE(review): ``globalxx`` applies no mismatch or gap penalties, so a
    substitution can score the same as an insertion plus a deletion.
    Consider a penalised alignment (e.g. ``globalms``) if substitutions go
    missing - confirm against real variant data.

    Args:
        ref_seq: Reference (wild-type) one-letter sequence.
        mut_seq: Mutant one-letter sequence.

    Returns:
        List of substitution strings in reference order; empty when no
        alignment is produced.
    """
    alignments = pairwise2.align.globalxx(ref_seq, mut_seq)
    if not alignments:
        return []
    aligned_ref, aligned_mut = alignments[0][0], alignments[0][1]

    mutations = []
    ref_position = 0  # residue number in ref_seq, not the alignment column
    for r, m in zip(aligned_ref, aligned_mut):
        if r != '-':
            ref_position += 1
        if r == m:
            continue
        if r == '-' or m == '-':
            logging.warning(
                "Skipping indel near reference position %d (%s -> %s)",
                ref_position, r, m)
            continue
        mutations.append(f"{r}{ref_position}{m}")
    return mutations
33+
34+
#convert mutations to a version readable by PDBFixer
35+
def convert_mutation_to_pdbfixer(mutation):
    """Translate a one-letter mutation code into PDBFixer's notation.

    The first character of ``mutation`` is taken as the original residue,
    the last character as the replacement and everything in between as the
    position, e.g. ``"E48D"`` becomes ``"GLU-48-ASP"``.
    """
    wild_type, residue_index, variant = mutation[0], mutation[1:-1], mutation[-1]

    # Upper-cased three-letter residue names, as used in PDB files.
    wild_type_name = seq3(wild_type).upper()
    variant_name = seq3(variant).upper()

    return "{}-{}-{}".format(wild_type_name, residue_index, variant_name)
47+
48+
#Make sure WT is in the correct format
49+
def clean_wildtype(pdbname: str, pH: str, pdbout: str):
    """Repair the wild-type structure with PDBFixer and write it to ``pdbout``.

    Non-standard residues are replaced, heterogens (water included, since
    ``removeHeterogens`` is called with ``False``) are removed, missing
    atoms are added and hydrogens are added for the requested pH.

    Args:
        pdbname: Path of the wild-type PDB file to clean.
        pH: pH used for adding hydrogens; converted with ``float()``.
        pdbout: Path of the cleaned PDB file to write.

    Returns:
        The ``PDBFixer`` instance holding the cleaned structure.
    """
    pH_fl = float(pH)
    pdb = PDBFixer(pdbname)
    pdb.findMissingResidues()
    pdb.findNonstandardResidues()
    pdb.replaceNonstandardResidues()
    pdb.removeHeterogens(False)
    pdb.findMissingAtoms()
    pdb.addMissingAtoms()
    pdb.addMissingHydrogens(pH_fl)
    # Context manager so the output handle is flushed and closed before the
    # file is re-read by later steps.
    with open(pdbout, 'w') as handle:
        PDBFile.writeFile(pdb.topology, pdb.positions, handle)
    return pdb
65+
66+
#create variants, implementing mutations across both chains
67+
def create_mutants(pdbname: str, mutant: list, chain: list, pH: str, pdbout: str):
    """Apply a set of mutations to every listed chain and write the variant.

    Args:
        pdbname: Path of the PDB file to mutate.
        mutant: Mutations in PDBFixer notation, e.g. ``["GLU-48-ASP"]``.
        chain: Chain IDs that each receive every mutation.
        pH: pH used for adding hydrogens; converted with ``float()``.
        pdbout: Path of the mutated PDB file to write.

    Returns:
        The ``PDBFixer`` instance holding the mutated structure.
    """
    pH_fl = float(pH)
    mutpdb = PDBFixer(pdbname)
    # Mutations are applied one at a time, chain by chain.
    for ch_list in chain:
        for mut_list in mutant:
            mutpdb.applyMutations([mut_list], ch_list)
    # Empty missing-residue table: no residue search is run here, so only
    # missing atoms (not whole residues) are rebuilt below.
    mutpdb.missingResidues = {}
    mutpdb.removeHeterogens(False)
    mutpdb.findMissingAtoms()
    mutpdb.addMissingAtoms()
    mutpdb.addMissingHydrogens(pH_fl)
    # Context manager so the output handle is flushed and closed; one file
    # is written per variant, so leaked handles would accumulate.
    with open(pdbout, 'w') as handle:
        PDBFile.writeFile(mutpdb.topology, mutpdb.positions, handle, keepIds=True)
    return mutpdb
84+
85+
def main():
    """Clean the wild type, then build one mutated PDB per CSV row.

    The wild type given by ``--wtin`` is cleaned into
    ``wildtype_centered.pdb``. Each row of the ``--varlist`` CSV must
    provide a ``design_id`` and a ``;``-separated ``mutations`` column; the
    mutations are applied to chains A and B and written to
    ``<design_id>_centered.pdb``.

    NOTE: for sequence-only input, take the name from ``row['description']``
    and derive the mutations with
    ``find_mutations(wt_sequence, row['sequence'])`` instead of reading the
    ``mutations`` column.
    """
    arguments = docopt(__doc__, version='mutant_creator.py')

    # The usage string marks every option as optional, but all three are
    # needed; report what is missing instead of failing later on None.
    missing = [opt for opt in ('--wtin', '--varlist', '--pH') if not arguments[opt]]
    if missing:
        raise SystemExit(
            "mutant_creator.py: missing required option(s): " + ", ".join(missing))

    wt = "wildtype_centered.pdb"
    clean_wildtype(arguments['--wtin'], arguments['--pH'], wt)
    chain = ["A", "B"]

    with open(arguments['--varlist'], 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            stem = row['design_id']
            pdbout = stem + "_centered.pdb"
            # Tolerate "A1G; C2T" style spacing and trailing separators,
            # which would otherwise yield malformed or empty mutation codes.
            mutation_list = [item.strip() for item in row['mutations'].split(';')]
            filtered_list = [
                mutation for mutation in mutation_list
                if mutation and mutation.lower() != 'c-terminal truncation'
            ]
            formatted_mutations = [convert_mutation_to_pdbfixer(mutation) for mutation in filtered_list]
            logging.info("%s %s", pdbout, ', '.join(formatted_mutations))
            create_mutants(wt, formatted_mutations, chain, arguments['--pH'], pdbout)


if __name__ == '__main__':
    # main() parses the command line itself, so it is not parsed again here.
    logging.getLogger().setLevel(logging.INFO)
    main()

0 commit comments

Comments
 (0)