
Commit 27838e9

Merge branch 'master' into fix/lindep_in_real_eigh_tddft
puzhichen committed Feb 11, 2025
2 parents 9260769 + c156379
Showing 109 changed files with 10,869 additions and 1,270 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pypi_wheel.yml
@@ -28,7 +28,7 @@ jobs:
           ls ${{ github.workspace }}/wheelhouse
       - name: Publish to PyPI
         run: |
-          pip install twine
+          pip install twine==6.0.1
           export TWINE_USERNAME=__token__
           export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
           twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -51,7 +51,7 @@ jobs:
           ls ${{ github.workspace }}/wheelhouse
       - name: Publish to PyPI
         run: |
-          pip install twine
+          pip install twine==6.0.1
           export TWINE_USERNAME=__token__
           export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
           twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -66,7 +66,7 @@ jobs:
           python3 setup.py sdist
       - name: Publish to PyPI
         run: |
-          pip install twine
+          pip install twine==6.0.1
           export TWINE_USERNAME=__token__
           export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
           twine upload --verbose "${{ github.workspace }}/dist/*"
9 changes: 9 additions & 0 deletions CHANGELOG
@@ -1,3 +1,12 @@
+v1.3.1 (2025-02-04)
+-------------------
+* New Features
+  - Analytical Hessian for PCM solvent model
+  - Driver for 3c methods (wB97x-3c, R2Scan-3c, B97-3c, etc.)
+* Improvements
+  - Preconditioner and computation efficiency of Davidson iterations for TDDFT
+
+
 v1.3.0 (2025-01-07)
 -------------------
 * New Features
50 changes: 50 additions & 0 deletions examples/40-all_electron_scf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Gamma point Hartree-Fock/DFT using density fitting approximation
+'''
+
+import numpy as np
+import pyscf
+
+cell = pyscf.M(
+    a = np.eye(3)*3.5668,
+    atom = '''C  0.      0.      0.
+              C  0.8917  0.8917  0.8917
+              C  1.7834  1.7834  0.
+              C  2.6751  2.6751  0.8917
+              C  1.7834  0.      1.7834
+              C  2.6751  0.8917  2.6751
+              C  0.      1.7834  1.7834
+              C  0.8917  2.6751  2.6751''',
+    basis = 'ccpvdz',
+    verbose = 5,
+)
+
+#
+# Gamma point HF and DFT
+#
+mf = cell.RHF().to_gpu().density_fit().run()
+
+mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run()
+
+#
+# K-point sampled HF and DFT
+#
+kpts = cell.make_kpts([2,2,2])
+kmf = cell.KRHF(kpts=kpts).to_gpu().density_fit().run()
+
+kmf = cell.KRKS(xc='pbe0', kpts=kpts).to_gpu().density_fit().run()
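
The new example uses only standard PySCF builder-chain APIs. A minimal follow-up sketch for inspecting the converged results, assuming the runs above completed (illustrative only, not part of the committed file):

# Illustrative follow-up to examples/40-all_electron_scf.py.
# e_tot, converged and mo_energy are standard PySCF mean-field attributes.
print('Gamma-point RKS total energy:', mf.e_tot)
print('SCF converged:', mf.converged)
for k, mo_e in enumerate(kmf.mo_energy):
    # K-point calculations carry one set of MO energies per k-point.
    print(f'k-point {k}: lowest MO energy = {mo_e[0]:.6f} Ha')
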
17 changes: 11 additions & 6 deletions gpu4pyscf/__config__.py
@@ -14,11 +14,11 @@
 
 import cupy
 
-_num_devices = cupy.cuda.runtime.getDeviceCount()
+num_devices = cupy.cuda.runtime.getDeviceCount()
 
 # TODO: switch to non_blocking stream (currently blocked by libxc)
-_streams = [None] * _num_devices
-for device_id in range(_num_devices):
+_streams = [None] * num_devices
+for device_id in range(num_devices):
     with cupy.cuda.Device(device_id):
         _streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False)
 
@@ -38,11 +38,16 @@
 mem_fraction = 0.9
 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
 
+if props['sharedMemPerBlockOptin'] > 65536:
+    shm_size = props['sharedMemPerBlockOptin']
+else:
+    shm_size = props['sharedMemPerBlock']
+
 # Check P2P data transfer is available
 _p2p_access = True
-if _num_devices > 1:
-    for src in range(_num_devices):
-        for dst in range(_num_devices):
+if num_devices > 1:
+    for src in range(num_devices):
+        for dst in range(num_devices):
             if src != dst:
                 can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
                 _p2p_access &= can_access_peer
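
The renamed num_devices and the per-device _streams list set up above feed the dispatch pattern repeated throughout this commit: one ThreadPoolExecutor worker per GPU, each thread pinning its own device. A condensed, self-contained sketch of that pattern (_worker is an illustrative placeholder, not a gpu4pyscf function):

# Sketch of the one-thread-per-GPU dispatch pattern used by the df modules below.
from concurrent.futures import ThreadPoolExecutor
import cupy

num_devices = cupy.cuda.runtime.getDeviceCount()

def _worker(data, device_id=0):
    # Each thread selects its device before allocating or launching kernels.
    with cupy.cuda.Device(device_id):
        return float(cupy.asarray(data).sum())

futures = []
with ThreadPoolExecutor(max_workers=num_devices) as executor:
    for device_id in range(num_devices):
        futures.append(executor.submit(_worker, [1.0, 2.0], device_id=device_id))
results = [f.result() for f in futures]
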
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = '1.3.0'
+__version__ = '1.3.1'
 
 from . import lib, grad, hessian, solvent, scf, dft, tdscf
18 changes: 9 additions & 9 deletions gpu4pyscf/df/df.py
@@ -25,7 +25,7 @@
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
@@ -218,17 +218,17 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     # CDERI will be equally distributed to the devices
     # Other devices usually have more memory available than Device 0
     # CDERI will use up to 40% of the available memory
-    use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
+    use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * num_devices
 
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
     else:
         log.debug("Saving CDERI on CPU")
 
     _cderi = {}
-    aux_blksize = (naux + _num_devices - 1) // _num_devices
+    aux_blksize = (naux + num_devices - 1) // num_devices
     aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
-    for device_id in range(_num_devices):
+    for device_id in range(num_devices):
         p0 = min(aux_blksize*device_id, naux)
         p1 = min(aux_blksize*(device_id+1), naux)
         #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
@@ -246,16 +246,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     npairs_per_ctr = np.array(npairs_per_ctr)
     total_task_list = np.argsort(npairs_per_ctr)
     task_list_per_device = []
-    for device_id in range(_num_devices):
-        task_list_per_device.append(total_task_list[device_id::_num_devices])
+    for device_id in range(num_devices):
+        task_list_per_device.append(total_task_list[device_id::num_devices])
 
     cd_low_f = cupy.array(cd_low, order='F', copy=False)
     cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
 
     cupy.cuda.get_current_stream().synchronize()
     futures = []
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             task_list = task_list_per_device[device_id]
             future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
@@ -352,7 +352,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
             for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
                 tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
                 copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
-        elif _num_devices > 1:
+        elif num_devices > 1:
             # Multi-GPU case, copy data to other Devices
             for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
                 # Making a copy for contiguous data transfer
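
The CDERI tensor is distributed over devices by a ceiling divide rounded up to the ALIGNED boundary, as in cholesky_eri_gpu above. A standalone sketch of the partition arithmetic, with illustrative sizes:

# Aligned per-device partition of the auxiliary dimension (illustrative sizes).
naux, num_devices, ALIGNED = 1000, 3, 32

aux_blksize = (naux + num_devices - 1) // num_devices           # ceiling divide -> 334
aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED  # round up to 32 -> 352

for device_id in range(num_devices):
    p0 = min(aux_blksize * device_id, naux)
    p1 = min(aux_blksize * (device_id + 1), naux)
    print(f'device {device_id}: aux slice [{p0}, {p1})')
# device 0: [0, 352), device 1: [352, 704), device 2: [704, 1000)
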
14 changes: 7 additions & 7 deletions gpu4pyscf/df/df_jk.py
@@ -26,7 +26,7 @@
 from gpu4pyscf.dft import rks, uks, numint
 from gpu4pyscf.scf import hf, uhf
 from gpu4pyscf.df import df, int3c2e
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 def _pin_memory(array):
     mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
@@ -453,8 +453,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
 
         futures = []
-        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-            for device_id in range(_num_devices):
+        with ThreadPoolExecutor(max_workers=num_devices) as executor:
+            for device_id in range(num_devices):
                 future = executor.submit(
                     _jk_task_with_mo,
                     dfobj, dms, mo_coeff, mo_occ,
@@ -474,8 +474,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s]
 
         futures = []
-        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-            for device_id in range(_num_devices):
+        with ThreadPoolExecutor(max_workers=num_devices) as executor:
+            for device_id in range(num_devices):
                 future = executor.submit(
                     _jk_task_with_mo1,
                     dfobj, dms, mo1s, occ_coeffs,
@@ -486,8 +486,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     # general K matrix with density matrix
     else:
         futures = []
-        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-            for device_id in range(_num_devices):
+        with ThreadPoolExecutor(max_workers=num_devices) as executor:
+            for device_id in range(num_devices):
                 future = executor.submit(
                     _jk_task_with_dm, dfobj, dms,
                     hermi=hermi, device_id=device_id,
12 changes: 6 additions & 6 deletions gpu4pyscf/df/grad/jk.py
@@ -18,7 +18,7 @@
 from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
 from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
     ''' # (L|ij) -> rhoj: (L), rhok: (L|oo)
@@ -61,8 +61,8 @@ def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True):
     '''
     futures = []
    cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _jk_task, with_df, dm, orbo,
                 with_j=with_j, with_k=with_k, device_id=device_id)
@@ -161,12 +161,12 @@ def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
 
     aux_ao_loc = np.array(intopt.aux_ao_loc)
     loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
-    task_list = _split_tasks(loads, _num_devices)
+    task_list = _split_tasks(loads, num_devices)
 
     futures = []
     cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id],
                 with_j=with_j, with_k=with_k, device_id=device_id, omega=omega)
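
get_grad_vjk distributes auxiliary shells by measured load rather than round-robin. The _split_tasks implementation itself is not part of this diff; the sketch below is only a plausible greedy splitter, included as an assumption for illustration:

import numpy as np

def _split_tasks_sketch(loads, num_devices):
    # Hypothetical stand-in for int3c2e._split_tasks: assign each task,
    # heaviest first, to the currently least-loaded device.
    task_list = [[] for _ in range(num_devices)]
    totals = np.zeros(num_devices)
    for task in np.argsort(loads)[::-1]:
        dev = int(np.argmin(totals))
        task_list[dev].append(int(task))
        totals[dev] += loads[task]
    return task_list

print(_split_tasks_sketch(np.array([5, 1, 9, 3, 7]), 2))
# -> [[2, 3, 1], [4, 0]], i.e. total loads 13 vs 12
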
14 changes: 7 additions & 7 deletions gpu4pyscf/df/hessian/jk.py
@@ -23,7 +23,7 @@
 from gpu4pyscf.hessian.jk import _ao2mo
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 NROOT_ON_GPU = 7
 
@@ -171,8 +171,8 @@ def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0,
     mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff]
 
     futures = []
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _jk_task_with_mo1,
                 dfobj, dms, mo_coeff, mo1s, occ_coeffs,
@@ -415,12 +415,12 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True,
     ncp_ij = len(intopt.log_qs)
     tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
     task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
+    for device_id in range(num_devices):
+        task_list.append(tasks[device_id::num_devices])
 
     cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _int3c2e_ipip_tasks, intopt, task_list[device_id],
                 rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
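
get_int3c2e_hjk instead interleaves its (ncp_k, ncp_ij) task grid round-robin via tasks[device_id::num_devices]. A tiny sketch of the stride trick, with illustrative sizes:

import itertools
import numpy as np

# Round-robin assignment: device d takes every num_devices-th task from offset d.
ncp_k, ncp_ij, num_devices = 3, 4, 2
tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
task_list = [tasks[d::num_devices] for d in range(num_devices)]
print(len(tasks), [len(t) for t in task_list])   # 12 [6, 6]
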
4 changes: 2 additions & 2 deletions gpu4pyscf/df/hessian/rks.py
@@ -46,8 +46,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = numpy.dot(mocc, mocc.T) * 2
 
-    if mf.nlc != '':
-        raise NotImplementedError
+    if mf.do_nlc():
+        raise NotImplementedError("2nd derivative of NLC is not implemented.")
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
4 changes: 2 additions & 2 deletions gpu4pyscf/df/hessian/uks.py
@@ -48,8 +48,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     moccb = mo_coeff[1][:,mo_occ[1]>0]
     dm0a = numpy.dot(mocca, mocca.T)
     dm0b = numpy.dot(moccb, moccb.T)
-    if mf.nlc != '':
-        raise NotImplementedError
+    if mf.do_nlc():
+        raise NotImplementedError("2nd derivative of NLC is not implemented.")
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
(Diff truncated; the remaining changed files are not shown here.)
