#!/usr/bin/env python3
"""interchain_pairs

Find the closest atom pairs at the interchain interface to provide evidence of
protein secondary structure.

Usage:
    interchain_pairs.py [--inpdb=<pdb>]

Options:
    --inpdb=<pdb>    Input PDB file of the protein, as obtained from the previous process
"""
import logging

import numpy as np
import pandas as pd
from docopt import docopt
import MDAnalysis as mda
from MDAnalysis import transformations
from biopandas.pdb import PandasPdb

def average_trajectory(pdb: str, pdbout: str):
    """Average the protein coordinates over all frames and write them to pdbout."""
    # Load the structure; a multi-model PDB is read as a trajectory of frames
    u = mda.Universe(pdb)
    ag = u.atoms
    # Box dimensions are hard-coded for this system; adjust if the box differs
    new_dimensions = [117.0, 117.0, 117.0, 90, 90, 90]
    set_dim = transformations.boxdimensions.set_dimensions(new_dimensions)
    # Unwrap molecules across periodic boundaries and centre the protein in the box
    unwrap = transformations.unwrap(ag)
    center = transformations.center_in_box(ag.select_atoms('protein'), wrap=True)
    u.trajectory.add_transformations(set_dim, unwrap, center)
    # Merge the protein into a new Universe; tempfactors are added so the PDB
    # writer has B-factor values available
    protein = u.select_atoms("protein")
    avg_universe = mda.Merge(protein)
    avg_universe.add_TopologyAttr('tempfactors')
    # Sum the protein coordinates over all frames, then divide by the frame count
    avg_coordinates = np.zeros_like(avg_universe.atoms.positions)
    for _ in u.trajectory:
        avg_coordinates += protein.positions
    avg_coordinates /= len(u.trajectory)
    logging.info("Averaged over %d frames", len(u.trajectory))
    # Assign the average coordinates to the new Universe and write it to a PDB file
    avg_universe.atoms.positions = avg_coordinates
    avg_universe.atoms.write(pdbout)

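# Note: MDAnalysis also provides an analysis class that computes an average
# structure after aligning each frame to a reference; a minimal sketch, assuming
# a recent MDAnalysis version (the explicit loop above is kept because it applies
# the box/unwrap/centering transformations rather than an alignment):
#
#   from MDAnalysis.analysis.align import AverageStructure
#   avg = AverageStructure(u, select="protein").run()
#   avg.results.universe.atoms.write(pdbout)
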
def get_contact_atoms(pdbout: str, threshold: float):
    """Return C-alpha pairs from chains A and B within threshold (Angstrom) of each other."""
    # Read the PDB records into a pandas dataframe (ATOM records only;
    # HETATM records could be concatenated here as well if needed)
    pdb_data = PandasPdb().read_pdb(pdbout)
    pdb_df = pdb_data.df['ATOM']
    pdb_df = pdb_df.dropna(subset=['residue_number'])
    # Coordinate columns, chain identifiers and atom name used to refine the dataframe
    coord_names = ['x_coord', 'y_coord', 'z_coord']
    chain1 = "A"
    chain2 = "B"
    calpha = "CA"
    # Select the C-alpha atoms of each chain into separate dataframes
    df1 = pdb_df[(pdb_df['chain_id'] == chain1) & (pdb_df['atom_name'] == calpha)]
    df2 = pdb_df[(pdb_df['chain_id'] == chain2) & (pdb_df['atom_name'] == calpha)]
    # Extract the coordinates as numpy arrays
    coords1 = df1[coord_names].to_numpy()
    coords2 = df2[coord_names].to_numpy()
    # Pairwise interchain distance matrix via numpy broadcasting
    dist_matrix = np.sqrt(((coords1[:, None] - coords2) ** 2).sum(axis=2))
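    # Note: the broadcasting expression above is equivalent to a pairwise
    # Euclidean distance matrix; a minimal alternative sketch, assuming SciPy
    # is available:
    #
    #   from scipy.spatial.distance import cdist
    #   dist_matrix = cdist(coords1, coords2)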
    # Indices of atom pairs whose distance is below the threshold
    pairs = np.argwhere(dist_matrix < threshold)
    logging.info("Pairs: %s", pairs.shape)
    logging.debug("Pair indices:\n%s", pairs)
    # Identify chain and residue of the atom pairs within the distance threshold
    atoms1, atoms2 = df1.iloc[pairs[:, 0]], df2.iloc[pairs[:, 1]]
    distances = dist_matrix[pairs[:, 0], pairs[:, 1]]
    logging.info("Length of atoms1: %d", len(atoms1))
    logging.info("Length of atoms2: %d", len(atoms2))
    logging.info("Length of distances: %d", len(distances))
    logging.debug("Pair distances:\n%s", distances)
    # Build "chain:residue_name:residue_number" identifiers for each atom in a pair
    atoms1_id = atoms1['chain_id'].map(str) + ":" + atoms1['residue_name'].map(str) + ":" + atoms1['residue_number'].map(str)
    atoms2_id = atoms2['chain_id'].map(str) + ":" + atoms2['residue_name'].map(str) + ":" + atoms2['residue_number'].map(str)
    node_pairs = np.vstack((atoms1_id.values, atoms2_id.values, distances)).T
    # Unique contact atoms of each chain, concatenated into a single dataframe
    result = pd.concat([df1.iloc[np.unique(pairs[:, 0])], df2.iloc[np.unique(pairs[:, 1])]])
    return node_pairs, result

def main():
    arguments = docopt(__doc__, version='interchain_pairs.py')
    pdb = arguments['--inpdb']
    pdbstem = pdb.replace(".pdb", "")
    pdbout = pdbstem + "_average.pdb"
    csvout = pdbstem + "_interchain_pairs.csv"
    # Average the frames, then find the interchain C-alpha pairs within the threshold
    average_trajectory(pdb, pdbout)
    threshold = 15.0
    node_pairs, _contacts = get_contact_atoms(pdbout, threshold)
    node_pairs_df = pd.DataFrame(node_pairs, columns=['Atom1', 'Atom2', 'Distance'])
    # The stacked array has object dtype, so make the distance column explicitly numeric
    node_pairs_df['Distance'] = node_pairs_df['Distance'].astype(float)
    node_pairs_df.to_csv(csvout)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    main()
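
# Example invocation (hypothetical file name; the input is assumed to be a
# multi-model PDB written by the preceding step of the pipeline):
#
#   python interchain_pairs.py --inpdb=complex_frames.pdb
#
# This writes complex_frames_average.pdb (the frame-averaged structure) and
# complex_frames_interchain_pairs.csv with Atom1, Atom2 and Distance columns.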