Segid columns change to 73-76 for PDBParser (#5001)

yuyuan871111 · web-flow · commit 7fb3534b1f88 · 2025-03-30T17:24:13.000-07:00
* fix reading of segids
* PDBParser segid column 73-76
* update CHANGELOG and AUTHORS
* add logger messages at info/debug in PDBParser for inconsistent segids or when chainids are used
* test_pdb with new segid columns
diff --git a/package/AUTHORS b/package/AUTHORS
@@ -250,7 +250,8 @@ Chronological list of authors
   - Joshua Raphael Uy 
   - Namir Oues
   - Lexi Xu
-  - BHM-Bob G    
+  - BHM-Bob G  
+  - Yu-Yuan (Stuart) Yang
 
 External code
 -------------
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -23,6 +23,9 @@ Fixes
  * Fixes bug in `analysis/gnm.py`: `closeContactGNMAnalysis`: correct the
    `residue_index_map` generation when selection is not `protein`.
    (Issue #4924, PR #4961)
+ * Reads `segids` in `PDBParser` from columns 73-76 instead of 67-76 to
+   match other PDB-reading programs (e.g., Chimera, CHARMM, Gemmi).
+   (Issue #4948, PR #5001)
 
 Enhancements
 
diff --git a/package/MDAnalysis/coordinates/PDB.py b/package/MDAnalysis/coordinates/PDB.py
@@ -214,7 +214,8 @@ class PDBReader(base.ReaderBase):
     47 - 54        Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
     55 - 60        Real(6.2)     occupancy    Occupancy.
     61 - 66        Real(6.2)     tempFactor   Temperature  factor.
-    67 - 76        String        segID        (unofficial CHARMM extension ?)
+    67 - 72                                   (not used in the official PDB format)
+    73 - 76        String        segID        (unofficial PDB format*)
     77 - 78        LString(2)    element      Element symbol, right-justified.
     79 - 80        LString(2)    charge       Charge  on the atom.
     =============  ============  ===========  =============================================
@@ -231,13 +232,19 @@ class PDBReader(base.ReaderBase):
 
     .. _CRYST1: http://www.wwpdb.org/documentation/file-format-content/format33/sect8.html#CRYST1
 
+    *Columns 73-76 are not part of the official PDB format, but some programs
+    use them to store the segment ID. For instance, Chimera_ assigns it to the
+    attribute `pdbSegment` so it can be used in command-line specifications.
+
+    .. _Chimera:
+        https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html#note6
 
     See Also
     --------
     :class:`PDBWriter`
     :class:`PDBReader`
 
-    
+
     .. versionchanged:: 0.11.0
        * Frames now 0-based instead of 1-based
        * New :attr:`title` (list with all TITLE lines).
diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py
@@ -67,6 +67,7 @@
 """
 import numpy as np
 import warnings
+import logging
 
 from ..guesser.tables import SYMB2Z
 from ..lib import util
@@ -91,6 +92,9 @@
     FormalCharges,
 )
 
+# Set up a logger for the PDBParser
+logger = logging.getLogger("MDAnalysis.topology.PDBParser")
+
 
 def float_or_default(val, default):
     try:
@@ -202,6 +206,8 @@ class PDBParser(TopologyReaderBase):
     .. versionchanged:: 2.8.0
         Removed type and mass guessing (attributes guessing takes place now
         through universe.guess_TopologyAttrs() API).
+    .. versionchanged:: 2.10.0
+        segID is read from columns 73-76 instead of 67-76.
     """
     format = ['PDB', 'ENT']
 
@@ -302,15 +308,21 @@ def _parseatoms(self):
                 occupancies.append(float_or_default(line[54:60], 0.0))
                 tempfactors.append(float_or_default(line[60:66], 1.0))  # AKA bfactor
 
-                segids.append(line[66:76].strip())
+                segids.append(line[72:76].strip())
 
         # Warn about wrapped serials
         if self._wrapped_serials:
             warnings.warn("Serial numbers went over 100,000.  "
                           "Higher serials have been guessed")
 
+        # Log (at debug level) when any segid differs from its chainid
+        if any([a != b for a, b in zip(segids, chainids)]):
+            logger.debug("Segment IDs and Chain IDs are not completely equal.")
+
         # If segids not present, try to use chainids
         if not any(segids):
+            logger.info("Setting segids from chainIDs because no segids "
+                        "found in the PDB file.")
             segids = chainids
 
         n_atoms = len(serials)
@@ -403,6 +415,8 @@ def _parseatoms(self):
             n_segments = 1
             attrs.append(Segids(np.array(['SYSTEM'], dtype=object)))
             segidx = None
+            logger.info("Segment/chain ID is empty, "
+                        "setting segids to default value 'SYSTEM'.")
 
         top = Topology(n_atoms, n_residues, n_segments,
                        attrs=attrs,
diff --git a/testsuite/MDAnalysisTests/coordinates/test_pdb.py b/testsuite/MDAnalysisTests/coordinates/test_pdb.py
@@ -1554,3 +1554,61 @@ def test_charges_limit(value):
     arr = np.array([0, 0, 0, value, 1, -1, 0], dtype=int)
     with pytest.raises(ValueError, match="9 is not supported by PDB standard"):
         mda.coordinates.PDB.PDBWriter._format_PDB_charges(arr)
+
+
+def test_read_segids():
+    """Check that segids are read from columns 73-76 rather than 67-76."""
+    invalid_seg_format_str = """\
+ATOM    659  N   THR A 315      22.716  15.055  -1.000  1.00 16.08   B       N
+ATOM    660  CA  THR A 315      22.888  13.803  -0.302  1.00  0.00   B       C
+ATOM    661  C   THR A 315      22.006  12.700  -0.882  1.00  0.00   B       C
+ATOM    662  O   THR A 315      21.138  12.959  -1.727  1.00 16.25   B       O
+ATOM    663  CB  THR A 315      22.481  13.956   1.182  1.00  0.00   B       C
+ATOM    664  CG2 THR A 315      23.384  14.924   1.927  1.00  0.00   B       C
+ATOM    665  OG1 THR A 315      21.172  14.548   1.274  1.00  0.00   B       O
+"""
+
+    acceptable_format_str = """\
+ATOM    659  N   THR A 315      22.716  15.055  -1.000  1.00 16.08           N
+ATOM    660  CA  THR A 315      22.888  13.803  -0.302  1.00 152.13          C
+ATOM    661  C   THR A 315      22.006  12.700  -0.882  1.00 15.69           C
+ATOM    662  O   THR A 315      21.138  12.959  -1.727  1.00 116.25          O
+ATOM    663  CB  THR A 315      22.481  13.956   1.182  1.00 16.22           C
+ATOM    664  CG2 THR A 315      22.874  15.310   1.747  1.00 173.26          C
+ATOM    665  OG1 THR A 315      21.047  13.922   1.304  1.00 15.14           O
+"""
+
+    standard_format_str = """\
+ATOM    659  N   THR A 315      22.716  15.055  -1.000  1.00 16.08        B  N
+ATOM    660  CA  THR A 315      22.888  13.803  -0.302  1.00 15.13        B  C
+ATOM    661  C   THR A 315      22.006  12.700  -0.882  1.00 15.69        B  C
+ATOM    662  O   THR A 315      21.138  12.959  -1.727  1.00 16.25        B  O
+ATOM    663  CB  THR A 315      22.481  13.956   1.182  1.00 16.22        B  C
+ATOM    664  CG2 THR A 315      22.874  15.310   1.747  1.00 17.32        B  C
+ATOM    665  OG1 THR A 315      21.047  13.922   1.304  1.00 15.14        B  O
+"""
+
+    u_invalid_segid = mda.Universe(
+        StringIO(invalid_seg_format_str), format="PDB"
+    )
+    u_acceptable = mda.Universe(StringIO(acceptable_format_str), format="PDB")
+    u_standard = mda.Universe(StringIO(standard_format_str), format="PDB")
+
+    # Before version 2.10.0, segids were read from columns 67-76, so the "B"
+    # located within columns 67-72 was picked up as the segid of every atom.
+    # Since version 2.10.0, segids are read from columns 73-76, which are
+    # blank here, so the segids fall back to the chainID "A" for all atoms.
+    assert_equal(
+        u_invalid_segid.atoms.segids, ["A"] * len(u_invalid_segid.atoms)
+    )
+
+    # Before version 2.10.0, segids were read from columns 67-76, so
+    # b-factors too wide for their column leaked a digit into the segid:
+    # atoms 660, 662 and 664 got segids "3", "5" and "6" respectively.
+    # Since version 2.10.0, segids are read from columns 73-76, which are
+    # blank here, so the segids fall back to the chainID "A" for all atoms.
+    assert_equal(u_acceptable.atoms.segids, ["A"] * len(u_acceptable.atoms))
+
+    # Since version 2.10.0, segids are read from columns 73-76, which hold
+    # "B" here, so the segid is "B" for all atoms.
+    assert_equal(u_standard.atoms.segids, ["B"] * len(u_standard.atoms))