Skip to content

Commit 7fb3534

Browse files
authored
Segid columns change to 73-76 for PDBParser (#5001)
* fix reading of segids * PDBParser segid column 73- * update CHANGELOG and AUTHORS * add logger messages at info/debug in PDBParser for inconsistent segids or when chainids are used * test_pdb with new segid columns
1 parent 66e7e5c commit 7fb3534

File tree

5 files changed

+87
-4
lines changed

5 files changed

+87
-4
lines changed

package/AUTHORS

+2-1
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,8 @@ Chronological list of authors
250250
- Joshua Raphael Uy
251251
- Namir Oues
252252
- Lexi Xu
253-
- BHM-Bob G
253+
- BHM-Bob G
254+
- Yu-Yuan (Stuart) Yang
254255

255256
External code
256257
-------------

package/CHANGELOG

+3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ Fixes
2323
* Fixes bug in `analysis/gnm.py`: `closeContactGNMAnalysis`: correct the
2424
`residue_index_map` generation when selection is not `protein`.
2525
(Issue #4924, PR #4961)
26+
* Reads `segids` column in `PDBParser` from 73-76 instead of 67-76 to
27+
align the standard of a PDBReader (e.g., Chimera, CHARMM, Gemmi).
28+
(Issue #4948, PR #5001)
2629

2730
Enhancements
2831

package/MDAnalysis/coordinates/PDB.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,8 @@ class PDBReader(base.ReaderBase):
214214
47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms.
215215
55 - 60 Real(6.2) occupancy Occupancy.
216216
61 - 66 Real(6.2) tempFactor Temperature factor.
217-
67 - 76 String segID (unofficial CHARMM extension ?)
217+
67 - 72 (not used in the official PDB format)
218+
73 - 76 String segID (unofficial PDB format*)
218219
77 - 78 LString(2) element Element symbol, right-justified.
219220
79 - 80 LString(2) charge Charge on the atom.
220221
============= ============ =========== =============================================
@@ -231,13 +232,19 @@ class PDBReader(base.ReaderBase):
231232
232233
.. _CRYST1: http://www.wwpdb.org/documentation/file-format-content/format33/sect8.html#CRYST1
233234
235+
*The columns 73-76 are not part of the official PDB format but are used by
236+
some programs to store/operate the segment ID. For instance, Chimera_ assigns
237+
it as the attribute `pdbSegment` to allow command-line specification.
238+
239+
.. _Chimera:
240+
https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html#note6
234241
235242
See Also
236243
--------
237244
:class:`PDBWriter`
238245
:class:`PDBReader`
239246
240-
247+
241248
.. versionchanged:: 0.11.0
242249
* Frames now 0-based instead of 1-based
243250
* New :attr:`title` (list with all TITLE lines).

package/MDAnalysis/topology/PDBParser.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
"""
6868
import numpy as np
6969
import warnings
70+
import logging
7071

7172
from ..guesser.tables import SYMB2Z
7273
from ..lib import util
@@ -91,6 +92,9 @@
9192
FormalCharges,
9293
)
9394

95+
# Set up a logger for the PDBParser
96+
logger = logging.getLogger("MDAnalysis.topology.PDBParser")
97+
9498

9599
def float_or_default(val, default):
96100
try:
@@ -202,6 +206,8 @@ class PDBParser(TopologyReaderBase):
202206
.. versionchanged:: 2.8.0
203207
Removed type and mass guessing (attributes guessing takes place now
204208
through universe.guess_TopologyAttrs() API).
209+
.. versionchanged:: 2.10.0
210+
segID is read from 73-76 instead of 67-76.
205211
"""
206212
format = ['PDB', 'ENT']
207213

@@ -302,15 +308,21 @@ def _parseatoms(self):
302308
occupancies.append(float_or_default(line[54:60], 0.0))
303309
tempfactors.append(float_or_default(line[60:66], 1.0)) # AKA bfactor
304310

305-
segids.append(line[66:76].strip())
311+
segids.append(line[72:76].strip())
306312

307313
# Warn about wrapped serials
308314
if self._wrapped_serials:
309315
warnings.warn("Serial numbers went over 100,000. "
310316
"Higher serials have been guessed")
311317

318+
# If segids is not equal to chainids, warn the user
319+
if any([a != b for a, b in zip(segids, chainids)]):
320+
logger.debug("Segment IDs and Chain IDs are not completely equal.")
321+
312322
# If segids not present, try to use chainids
313323
if not any(segids):
324+
logger.info("Setting segids from chainIDs because no segids "
325+
"found in the PDB file.")
314326
segids = chainids
315327

316328
n_atoms = len(serials)
@@ -403,6 +415,8 @@ def _parseatoms(self):
403415
n_segments = 1
404416
attrs.append(Segids(np.array(['SYSTEM'], dtype=object)))
405417
segidx = None
418+
logger.info("Segment/chain ID is empty, "
419+
"setting segids to default value 'SYSTEM'.")
406420

407421
top = Topology(n_atoms, n_residues, n_segments,
408422
attrs=attrs,

testsuite/MDAnalysisTests/coordinates/test_pdb.py

+58
Original file line numberDiff line numberDiff line change
@@ -1554,3 +1554,61 @@ def test_charges_limit(value):
15541554
arr = np.array([0, 0, 0, value, 1, -1, 0], dtype=int)
15551555
with pytest.raises(ValueError, match="9 is not supported by PDB standard"):
15561556
mda.coordinates.PDB.PDBWriter._format_PDB_charges(arr)
1557+
1558+
1559+
def test_read_segids():
1560+
# test to read the segids using column 73-76 instead of 67-76
1561+
invalid_seg_format_str = """\
1562+
ATOM 659 N THR A 315 22.716 15.055 -1.000 1.00 16.08 B N
1563+
ATOM 660 CA THR A 315 22.888 13.803 -0.302 1.00 0.00 B C
1564+
ATOM 661 C THR A 315 22.006 12.700 -0.882 1.00 0.00 B C
1565+
ATOM 662 O THR A 315 21.138 12.959 -1.727 1.00 16.25 B O
1566+
ATOM 663 CB THR A 315 22.481 13.956 1.182 1.00 0.00 B C
1567+
ATOM 664 CG2 THR A 315 23.384 14.924 1.927 1.00 0.00 B C
1568+
ATOM 665 OG1 THR A 315 21.172 14.548 1.274 1.00 0.00 B O
1569+
"""
1570+
1571+
acceptable_format_str = """\
1572+
ATOM 659 N THR A 315 22.716 15.055 -1.000 1.00 16.08 N
1573+
ATOM 660 CA THR A 315 22.888 13.803 -0.302 1.00 152.13 C
1574+
ATOM 661 C THR A 315 22.006 12.700 -0.882 1.00 15.69 C
1575+
ATOM 662 O THR A 315 21.138 12.959 -1.727 1.00 116.25 O
1576+
ATOM 663 CB THR A 315 22.481 13.956 1.182 1.00 16.22 C
1577+
ATOM 664 CG2 THR A 315 22.874 15.310 1.747 1.00 173.26 C
1578+
ATOM 665 OG1 THR A 315 21.047 13.922 1.304 1.00 15.14 O
1579+
"""
1580+
1581+
standard_format_str = """\
1582+
ATOM 659 N THR A 315 22.716 15.055 -1.000 1.00 16.08 B N
1583+
ATOM 660 CA THR A 315 22.888 13.803 -0.302 1.00 15.13 B C
1584+
ATOM 661 C THR A 315 22.006 12.700 -0.882 1.00 15.69 B C
1585+
ATOM 662 O THR A 315 21.138 12.959 -1.727 1.00 16.25 B O
1586+
ATOM 663 CB THR A 315 22.481 13.956 1.182 1.00 16.22 B C
1587+
ATOM 664 CG2 THR A 315 22.874 15.310 1.747 1.00 17.32 B C
1588+
ATOM 665 OG1 THR A 315 21.047 13.922 1.304 1.00 15.14 B O
1589+
"""
1590+
1591+
u_invalid_segid = mda.Universe(
1592+
StringIO(invalid_seg_format_str), format="PDB"
1593+
)
1594+
u_acceptable = mda.Universe(StringIO(acceptable_format_str), format="PDB")
1595+
u_standard = mda.Universe(StringIO(standard_format_str), format="PDB")
1596+
1597+
# Before version 2.10.0, segid was read from column 67-76.
1598+
# Thus, segids existed and were set to "B" for all atoms.
1599+
# After version 2.10.0, segid is read from column 73-76.
1600+
# segid is expected to set by chainID "A" for all atoms.
1601+
assert_equal(
1602+
u_invalid_segid.atoms.segids, ["A"] * len(u_invalid_segid.atoms)
1603+
)
1604+
1605+
# Before version 2.10.0, segid was set to read from column 67-76.
1606+
# Due to misalignment in b-factor column,
1607+
# segids were set to ['3', '', '5', '', '6'] for all atoms.
1608+
# After version 2.10.0, segid is read from column 73-76.
1609+
# segid is expected to set by chainID "A" for all atoms.
1610+
assert_equal(u_acceptable.atoms.segids, ["A"] * len(u_standard.atoms))
1611+
1612+
# After version 2.10.0, segid is read from column 73-76.
1613+
# segid is set to "B" for all atoms
1614+
assert_equal(u_standard.atoms.segids, ["B"] * len(u_standard.atoms))

0 commit comments

Comments
 (0)