Mol2 format implementation addressing theochem#78 and theochem#37

QCMM · May 29, 2019 · 15246e9 · 15246e9
1 parent 7d40670
commit 15246e9
Show file tree

Hide file tree

Showing 3 changed files with 373 additions and 0 deletions.
diff --git a/iodata/formats/mol2.py b/iodata/formats/mol2.py
@@ -0,0 +1,152 @@
+# IODATA is an input and output module for quantum chemistry.
+# Copyright (C) 2011-2019 The IODATA Development Team
+#
+# This file is part of IODATA.
+#
+# IODATA is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+#
+# IODATA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>
+# --
+"""MOL2 file format.
+
+There are different formats of mol2 files. Here the compatibility with AMBER software
+was the main objective to write out files with atomic charges used by antechamber.
+"""
+
+
+from typing import TextIO, Iterator, Tuple
+
+import numpy as np
+
+from ..docstrings import (document_load_one, document_load_many, document_dump_one,
+                          document_dump_many)
+from ..iodata import IOData
+from ..periodic import sym2num, num2sym
+from ..utils import angstrom, LineIterator
+
+
+__all__ = []
+
+
+PATTERNS = ['*.mol2']
+
+
+@document_load_one("MOL2", ['atcoords', 'atnums', 'atcharges', 'atffparams'], ['title'])
+def load_one(lit: LineIterator) -> dict:
+    """Do not edit this docstring. It will be overwritten."""
+    molecule_found = False
+    while True:
+        try:
+            line = next(lit)
+        except StopIteration:
+            break
+        if len(line) > 1:
+            words = line.split()
+            if words[0] == "@<TRIPOS>MOLECULE":
+                # Found another molecule; go one line back and break
+                if molecule_found:
+                    lit.back(line)
+                    break
+                title = next(lit).strip()
+            if words[0] == "@<TRIPOS>ATOM":
+                atnums, atcoords, atchgs, attypes = _load_helper_atoms(lit)
+                atcharges = {"mol2": atchgs}
+                atffparams = {"attypes": attypes}
+                result = {
+                    'atcoords': atcoords,
+                    'atnums': atnums,
+                    'atcharges': atcharges,
+                    'atffparams': atffparams
+                }
+                if title is not None:
+                    result['title'] = title
+                molecule_found = True
+    if molecule_found is False:
+        raise lit.error("Molecule could not be read")
+    return result
+
+
+def _load_helper_atoms(lit: LineIterator) -> Tuple[np.ndarray, np.ndarray, np.ndarray, tuple]:
+    """Load element numbers, coordinates and atomic charges."""
+    atnums = []
+    atcoords = []
+    atchgs = []
+    attypes = []
+    for line in lit:
+        words = line.split()
+        if len(words) <= 6:
+            break
+        # Assume that the first character of atom type is the element
+        try:
+            atnums.append(sym2num.get(words[5][0].title()))
+        except ValueError:
+            print(f'Can not convert atom type {words[5][0]} to element')
+        attypes.append(words[5])
+        atcoords.append([float(words[2]), float(words[3]), float(words[4])])
+        if len(words) == 9:
+            atchgs.append(float(words[8]))
+        else:
+            atchg.append(0.0)
+    atnums = np.array(atnums, int)
+    atcoords = np.array(atcoords) * angstrom
+    atchgs = np.array(atchgs)
+    attypes = tuple(attypes)
+    return atnums, atcoords, atchgs, attypes
+
+
+@document_load_many("MOL2", ['atcoords', 'atnums', 'atcharges', 'atffparams'], ['title'])
+def load_many(lit: LineIterator) -> Iterator[dict]:
+    """Do not edit this docstring. It will be overwritten."""
+    # MOL2 files with more molecules are a simple concatenation of individual MOL2 files,'
+    # making it trivial to load many frames.
+    while True:
+        try:
+            yield load_one(lit)
+        except IOError:
+            return
+
+
+@document_dump_one("MOL2", ['atcoords', 'atnums'], ['atcharges', 'atffparams',  'title'])
+def dump_one(f: TextIO, data: IOData):
+    """Do not edit this docstring. It will be overwritten."""
+    # The first six lines are reserved for comments
+    print("# Mol2 file created with Iodata", file=f)
+    print("\n\n\n\n\n", file=f)
+    print("@<TRIPOS>MOLECULE", file=f)
+    print(data.title or 'Created with IOData', file=f)
+    print("@<TRIPOS>ATOM", file=f)
+    for i in range(data.natom):
+        n = num2sym[data.atnums[i]]
+        x, y, z = data.atcoords[i] / angstrom
+        out1 = f'{i:6d} {n:2s} {x:10.4f} {y:10.4f} {z:10.4f} '
+        if not data.atcharges:
+            # Will write the first atomic charge key of the dictionary
+            charge = data.atcharges[0][i]
+            if not data.attparams:
+                for param, value in data.atffparams.items():
+                    if param == 'attypes':
+                        attype = values[i]
+                        out2 = f'{attype:3s} 1 XXX {charge:1.5f}'
+            else:
+                out2 = f'{n:2s} 1 XXX {charge:1.5f}'
+            print(out1 + out2, file=f)
+        else:
+            out2 = f'{n:2s} 1 XXX 0.0000'
+            print(out1 + out2, file=f)
+
+
+@document_dump_many("MOL2", ['atcoords', 'atnums', 'atcharges'], ['title'])
+def dump_many(f: TextIO, datas: Iterator[IOData]):
+    """Do not edit this docstring. It will be overwritten."""
+    # Similar to load_many, this is relatively easy.
+    for data in datas:
+        dump_one(f, data)
diff --git a/iodata/test/data/caffeine.mol2 b/iodata/test/data/caffeine.mol2
@@ -0,0 +1,114 @@
+@<TRIPOS>MOLECULE
+ZINC00001084
+   24    25     0     0     0
+SMALL
+USER_CHARGES
+
+@<TRIPOS>ATOM
+      1 C1         -0.0178    1.4608    0.0101 C.3       1 <0>         0.0684
+      2 N1          0.0021   -0.0041    0.0020 N.pl3     1 <0>        -0.4126
+      3 C2          0.0085   -0.7996   -1.0853 C.2       1 <0>         0.2683
+      4 N2          0.0275   -2.0544   -0.7142 N.2       1 <0>        -0.4991
+      5 C3          0.0343   -2.1318    0.6190 C.2       1 <0>         0.3175
+      6 C4          0.0238   -0.8312    1.1071 C.2       1 <0>        -0.1559
+      7 C5          0.0338   -0.6202    2.5028 C.2       1 <0>         0.5949
+      8 O1          0.0219    0.5106    2.9550 O.2       1 <0>        -0.5132
+      9 N3          0.0564   -1.6860    3.3300 N.am      1 <0>        -0.5520
+     10 C6          0.0693   -2.9392    2.8440 C.2       1 <0>         0.7070
+     11 O2          0.0891   -3.8755    3.6197 O.2       1 <0>        -0.5319
+     12 N4          0.0527   -3.1841    1.5201 N.am      1 <0>        -0.5109
+     13 C7          0.0693   -4.5651    1.0316 C.3       1 <0>         0.0891
+     14 C8          0.0674   -1.4741    4.7795 C.3       1 <0>         0.0836
+     15 H1          1.0048    1.8381    0.0023 H         1 <0>         0.0879
+     16 H2         -0.5450    1.8219   -0.8730 H         1 <0>         0.0987
+     17 H3         -0.5280    1.8123    0.9069 H         1 <0>         0.1006
+     18 H4         -0.0012   -0.4540   -2.1085 H         1 <0>         0.2318
+     19 H5          1.1010   -4.8936    0.9058 H         1 <0>         0.0732
+     20 H6         -0.4324   -5.2114    1.7518 H         1 <0>         0.0936
+     21 H7         -0.4482   -4.6180    0.0738 H         1 <0>         0.0925
+     22 H8         -0.9574   -1.4357    5.1489 H         1 <0>         0.0769
+     23 H9          0.5971   -2.2951    5.2627 H         1 <0>         0.0968
+     24 H10         0.5705   -0.5340    5.0055 H         1 <0>         0.0949
+@<TRIPOS>BOND
+     1    1    2 1
+     2    1   15 1
+     3    1   16 1
+     4    1   17 1
+     5    2    6 1
+     6    2    3 1
+     7    3    4 2
+     8    3   18 1
+     9    4    5 1
+    10    5   12 1
+    11    5    6 2
+    12    6    7 1
+    13    7    8 2
+    14    7    9 am
+    15    9   10 am
+    16    9   14 1
+    17   10   11 2
+    18   10   12 am
+    19   12   13 1
+    20   13   19 1
+    21   13   20 1
+    22   13   21 1
+    23   14   22 1
+    24   14   23 1
+    25   14   24 1
+@<TRIPOS>MOLECULE
+ZINC00001085
+   24    25     0     0     0
+SMALL
+USER_CHARGES
+
+@<TRIPOS>ATOM
+      1 C1         -0.0100    1.5608    0.0201 C.3       1 <0>         0.0684
+      2 N1          0.0021   -0.0041    0.0020 N.pl3     1 <0>        -0.4126
+      3 C2          0.0085   -0.7996   -1.0853 C.2       1 <0>         0.2683
+      4 N2          0.0275   -2.0544   -0.7142 N.2       1 <0>        -0.4991
+      5 C3          0.0343   -2.1318    0.6190 C.2       1 <0>         0.3175
+      6 C4          0.0238   -0.8312    1.1071 C.2       1 <0>        -0.1559
+      7 C5          0.0338   -0.6202    2.5028 C.2       1 <0>         0.5949
+      8 O1          0.0219    0.5106    2.9550 O.2       1 <0>        -0.5132
+      9 N3          0.0564   -1.6860    3.3300 N.am      1 <0>        -0.5520
+     10 C6          0.0693   -2.9392    2.8440 C.2       1 <0>         0.7070
+     11 O2          0.0891   -3.8755    3.6197 O.2       1 <0>        -0.5319
+     12 N4          0.0527   -3.1841    1.5201 N.am      1 <0>        -0.5109
+     13 C7          0.0693   -4.5651    1.0316 C.3       1 <0>         0.0891
+     14 C8          0.0674   -1.4741    4.7795 C.3       1 <0>         0.0836
+     15 H1          1.0048    1.8381    0.0023 H         1 <0>         0.0879
+     16 H2         -0.5450    1.8219   -0.8730 H         1 <0>         0.0987
+     17 H3         -0.5280    1.8123    0.9069 H         1 <0>         0.1006
+     18 H4         -0.0012   -0.4540   -2.1085 H         1 <0>         0.2318
+     19 H5          1.1010   -4.8936    0.9058 H         1 <0>         0.0732
+     20 H6         -0.4324   -5.2114    1.7518 H         1 <0>         0.0936
+     21 H7         -0.4482   -4.6180    0.0738 H         1 <0>         0.0925
+     22 H8         -0.9574   -1.4357    5.1489 H         1 <0>         0.0769
+     23 H9          0.5971   -2.2951    5.2627 H         1 <0>         0.0968
+     24 H10         0.5705   -0.5340    5.0055 H         1 <0>         0.0949
+@<TRIPOS>BOND
+     1    1    2 1
+     2    1   15 1
+     3    1   16 1
+     4    1   17 1
+     5    2    6 1
+     6    2    3 1
+     7    3    4 2
+     8    3   18 1
+     9    4    5 1
+    10    5   12 1
+    11    5    6 2
+    12    6    7 1
+    13    7    8 2
+    14    7    9 am
+    15    9   10 am
+    16    9   14 1
+    17   10   11 2
+    18   10   12 am
+    19   12   13 1
+    20   13   19 1
+    21   13   20 1
+    22   13   21 1
+    23   14   22 1
+    24   14   23 1
+    25   14   24 1
diff --git a/iodata/test/test_mol2.py b/iodata/test/test_mol2.py
@@ -0,0 +1,107 @@
+# IODATA is an input and output module for quantum chemistry.
+# Copyright (C) 2011-2019 The IODATA Development Team
+#
+# This file is part of IODATA.
+#
+# IODATA is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 3
+# of the License, or (at your option) any later version.
+#
+# IODATA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>
+# --
+"""Test iodata.formats.mol2 module."""
+
+import os
+
+import pytest
+from numpy.testing import assert_equal, assert_allclose
+
+from .common import truncated_file
+from ..api import load_one, load_many, dump_one, dump_many
+from ..utils import angstrom
+try:
+    from importlib_resources import path
+except ImportError:
+    from importlib.resources import path
+
+
+def test_mol2_load_one():
+    # test mol2 one structure
+    with path('iodata.test.data', 'caffeine.mol2') as fn_mol:
+        mol = load_one(str(fn_mol))
+    check_example(mol)
+
+
+def test_mol2_formaterror(tmpdir):
+    # test if sdf file has the wrong ending without $$$$
+    with path('iodata.test.data', 'caffeine.mol2') as fn_test:
+        with truncated_file(fn_test, 2, 0, tmpdir) as fn:
+            with pytest.raises(IOError):
+                load_one(str(fn))
+
+
+def check_example(mol):
+    """Test some things on example file."""
+    assert mol.title == 'ZINC00001084'
+    assert_equal(mol.natom, 24)
+    assert_equal(mol.atnums, [6, 7, 6, 7, 6, 6, 6, 8, 7, 6, 8, 7, 6, 6,
+                              1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    assert mol.atffparams['attypes'][0] == 'C.3'
+    # check coordinates
+    atcoords_ang = mol.atcoords / angstrom
+    assert_allclose(atcoords_ang[0], [-0.0178, 1.4608, 0.0101])
+    assert_allclose(atcoords_ang[1], [0.0021, -0.0041, 0.0020])
+    assert_allclose(atcoords_ang[22], [0.5971, -2.2951, 5.2627])
+    assert_allclose(atcoords_ang[23], [0.5705, -0.5340, 5.0055])
+    assert_allclose(mol.atcharges['mol2'][0], 0.0684)
+    assert_allclose(mol.atcharges['mol2'][23], 0.0949)
+
+
+def check_load_dump_consistency(tmpdir, fn):
+    """Check if dumping and loading an SDF file results in the same data."""
+    mol0 = load_one(str(fn))
+    # write sdf file in a temporary folder & then read it
+    fn_tmp = os.path.join(tmpdir, 'test.mol2')
+    dump_one(mol0, fn_tmp, fmt='mol2')
+    mol1 = load_one(fn_tmp)
+    # check two sdf files
+    assert mol0.title == mol1.title
+    assert_equal(mol0.atnums, mol1.atnums)
+    assert_allclose(mol0.atcoords, mol1.atcoords, atol=1.e-5)
+
+
+def test_load_dump_consistency(tmpdir):
+    with path('iodata.test.data', 'caffeine.mol2') as fn_mol2:
+        check_load_dump_consistency(tmpdir, fn_mol2)
+
+
+def test_load_many():
+    with path('iodata.test.data', 'caffeine.mol2') as fn_mol2:
+        mols = list(load_many(str(fn_mol2)))
+    assert len(mols) == 2
+    check_example(mols[0])
+    assert mols[1].title == 'ZINC00001085'
+    assert_equal(mols[1].natom, 24)
+    assert_allclose(mols[0].atcoords[0] / angstrom, [-0.0178, 1.4608, 0.0101])
+    assert_allclose(mols[1].atcoords[0] / angstrom, [-0.0100, 1.5608, 0.0201])
+
+
+def test_load_dump_many_consistency(tmpdir):
+    with path('iodata.test.data', 'caffeine.mol2') as fn_mol2:
+        mols0 = list(load_many(str(fn_mol2)))
+    # write mol2 file in a temporary folder & then read it
+    fn_tmp = os.path.join(tmpdir, 'test')
+    dump_many(mols0, fn_tmp, fmt='mol2')
+    mols1 = list(load_many(fn_tmp, fmt='mol2'))
+    assert len(mols0) == len(mols1)
+    for mol0, mol1 in zip(mols0, mols1):
+        assert mol0.title == mol1.title
+        assert_equal(mol0.atnums, mol1.atnums)
+        assert_allclose(mol0.atcoords, mol1.atcoords, atol=1.e-5)