
Commit 27838e9

Merge branch 'master' into fix/lindep_in_real_eigh_tddft
puzhichen committed Feb 11, 2025
2 parents 9260769 + c156379
Showing 109 changed files with 10,869 additions and 1,270 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pypi_wheel.yml
@@ -28,7 +28,7 @@ jobs:
           ls ${{ github.workspace }}/wheelhouse
       - name: Publish to PyPI
         run: |
-          pip install twine
+          pip install twine==6.0.1
           export TWINE_USERNAME=__token__
           export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
           twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -51,7 +51,7 @@ jobs:
           ls ${{ github.workspace }}/wheelhouse
       - name: Publish to PyPI
         run: |
-          pip install twine
+          pip install twine==6.0.1
           export TWINE_USERNAME=__token__
           export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
           twine upload --verbose "${{ github.workspace }}/wheelhouse/*"
@@ -66,7 +66,7 @@ jobs:
           python3 setup.py sdist
       - name: Publish to PyPI
         run: |
-          pip install twine
+          pip install twine==6.0.1
           export TWINE_USERNAME=__token__
           export TWINE_PASSWORD="${{ secrets.PYPI_API_TOKEN }}"
           twine upload --verbose "${{ github.workspace }}/dist/*"
9 changes: 9 additions & 0 deletions CHANGELOG
@@ -1,3 +1,12 @@
+v1.3.1 (2025-02-04)
+-------------------
+* New Features
+  - Analytical Hessian for PCM solvent model
+  - Driver for 3c methods (wB97x-3c, R2Scan-3c, B97-3c, etc.)
+* Improvements
+  - Preconditioner and computation efficiency of Davidson iterations for TDDFT
+
+
 v1.3.0 (2025-01-07)
 -------------------
 * New Features
50 changes: 50 additions & 0 deletions examples/40-all_electron_scf.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# Copyright 2025 The PySCF Developers. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+'''
+Gamma point Hartree-Fock/DFT using density fitting approximation
+'''
+
+import numpy as np
+import pyscf
+
+cell = pyscf.M(
+    a = np.eye(3)*3.5668,
+    atom = '''C  0.      0.      0.
+              C  0.8917  0.8917  0.8917
+              C  1.7834  1.7834  0.
+              C  2.6751  2.6751  0.8917
+              C  1.7834  0.      1.7834
+              C  2.6751  0.8917  2.6751
+              C  0.      1.7834  1.7834
+              C  0.8917  2.6751  2.6751''',
+    basis = 'ccpvdz',
+    verbose = 5,
+)
+
+#
+# Gamma point HF and DFT
+#
+mf = cell.RHF().to_gpu().density_fit().run()
+
+mf = cell.RKS(xc='pbe0').to_gpu().density_fit().run()
+
+#
+# K-point sampled HF and DFT
+#
+kpts = cell.make_kpts([2,2,2])
+kmf = cell.KRHF(kpts=kpts).to_gpu().density_fit().run()
+
+kmf = cell.KRKS(xc='pbe0', kpts=kpts).to_gpu().density_fit().run()
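
The new example uses only standard PySCF builder-chain APIs. A minimal follow-up sketch for inspecting the converged results, assuming the runs above completed (illustrative only, not part of the committed file):

# Illustrative follow-up to examples/40-all_electron_scf.py.
# e_tot, converged and mo_energy are standard PySCF mean-field attributes.
print('Gamma-point RKS total energy:', mf.e_tot)
print('SCF converged:', mf.converged)
for k, mo_e in enumerate(kmf.mo_energy):
    # K-point calculations carry one set of MO energies per k-point.
    print(f'k-point {k}: lowest MO energy = {mo_e[0]:.6f} Ha')
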
17 changes: 11 additions & 6 deletions gpu4pyscf/__config__.py
@@ -14,11 +14,11 @@
 
 import cupy
 
-_num_devices = cupy.cuda.runtime.getDeviceCount()
+num_devices = cupy.cuda.runtime.getDeviceCount()
 
 # TODO: switch to non_blocking stream (currently blocked by libxc)
-_streams = [None] * _num_devices
-for device_id in range(_num_devices):
+_streams = [None] * num_devices
+for device_id in range(num_devices):
     with cupy.cuda.Device(device_id):
         _streams[device_id] = cupy.cuda.stream.Stream(non_blocking=False)
 
@@ -38,11 +38,16 @@
 mem_fraction = 0.9
 cupy.get_default_memory_pool().set_limit(fraction=mem_fraction)
 
+if props['sharedMemPerBlockOptin'] > 65536:
+    shm_size = props['sharedMemPerBlockOptin']
+else:
+    shm_size = props['sharedMemPerBlock']
+
 # Check P2P data transfer is available
 _p2p_access = True
-if _num_devices > 1:
-    for src in range(_num_devices):
-        for dst in range(_num_devices):
+if num_devices > 1:
+    for src in range(num_devices):
+        for dst in range(num_devices):
             if src != dst:
                 can_access_peer = cupy.cuda.runtime.deviceCanAccessPeer(src, dst)
                 _p2p_access &= can_access_peer
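
The renamed num_devices and the per-device _streams list set up above feed the dispatch pattern repeated throughout this commit: one ThreadPoolExecutor worker per GPU, each thread pinning its own device. A condensed, self-contained sketch of that pattern (_worker is an illustrative placeholder, not a gpu4pyscf function):

# Sketch of the one-thread-per-GPU dispatch pattern used by the df modules below.
from concurrent.futures import ThreadPoolExecutor
import cupy

num_devices = cupy.cuda.runtime.getDeviceCount()

def _worker(data, device_id=0):
    # Each thread selects its device before allocating or launching kernels.
    with cupy.cuda.Device(device_id):
        return float(cupy.asarray(data).sum())

futures = []
with ThreadPoolExecutor(max_workers=num_devices) as executor:
    for device_id in range(num_devices):
        futures.append(executor.submit(_worker, [1.0, 2.0], device_id=device_id))
results = [f.result() for f in futures]
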
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
@@ -12,6 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = '1.3.0'
+__version__ = '1.3.1'
 
 from . import lib, grad, hessian, solvent, scf, dft, tdscf
18 changes: 9 additions & 9 deletions gpu4pyscf/df/df.py
@@ -25,7 +25,7 @@
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128)
 ALIGNED = getattr(__config__, 'ao_aligned', 32)
@@ -218,17 +218,17 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     # CDERI will be equally distributed to the devices
     # Other devices usually have more memory available than Device 0
     # CDERI will use up to 40% of the available memory
-    use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * _num_devices
+    use_gpu_memory = naux * npairs * 8 < 0.4 * avail_mem * num_devices
 
     if use_gpu_memory:
         log.debug("Saving CDERI on GPU")
     else:
         log.debug("Saving CDERI on CPU")
 
     _cderi = {}
-    aux_blksize = (naux + _num_devices - 1) // _num_devices
+    aux_blksize = (naux + num_devices - 1) // num_devices
     aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED
-    for device_id in range(_num_devices):
+    for device_id in range(num_devices):
         p0 = min(aux_blksize*device_id, naux)
         p1 = min(aux_blksize*(device_id+1), naux)
         #for device_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
@@ -246,16 +246,16 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low,
     npairs_per_ctr = np.array(npairs_per_ctr)
     total_task_list = np.argsort(npairs_per_ctr)
     task_list_per_device = []
-    for device_id in range(_num_devices):
-        task_list_per_device.append(total_task_list[device_id::_num_devices])
+    for device_id in range(num_devices):
+        task_list_per_device.append(total_task_list[device_id::num_devices])
 
     cd_low_f = cupy.array(cd_low, order='F', copy=False)
     cd_low_f = tag_array(cd_low_f, tag=cd_low.tag)
 
     cupy.cuda.get_current_stream().synchronize()
     futures = []
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             task_list = task_list_per_device[device_id]
             future = executor.submit(_cderi_task, intopt, cd_low_f, task_list, _cderi, aux_blksize,
                                      omega=omega, sr_only=sr_only, device_id=device_id)
@@ -352,7 +352,7 @@ def _cderi_task(intopt, cd_low, task_list, _cderi, aux_blksize,
             for slice_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
                 tmp = cupy.array(cderi_block[p0:p1], order='C', copy=True)
                 copy_array(tmp, _cderi[slice_id][:p1-p0,ij0:ij1])
-        elif _num_devices > 1:
+        elif num_devices > 1:
             # Multi-GPU case, copy data to other Devices
             for dev_id, (p0,p1) in enumerate(lib.prange(0, naux, aux_blksize)):
                 # Making a copy for contiguous data transfer
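
The CDERI tensor is distributed over devices by a ceiling divide rounded up to the ALIGNED boundary, as in cholesky_eri_gpu above. A standalone sketch of the partition arithmetic, with illustrative sizes:

# Aligned per-device partition of the auxiliary dimension (illustrative sizes).
naux, num_devices, ALIGNED = 1000, 3, 32

aux_blksize = (naux + num_devices - 1) // num_devices           # ceiling divide -> 334
aux_blksize = (aux_blksize + ALIGNED - 1) // ALIGNED * ALIGNED  # round up to 32 -> 352

for device_id in range(num_devices):
    p0 = min(aux_blksize * device_id, naux)
    p1 = min(aux_blksize * (device_id + 1), naux)
    print(f'device {device_id}: aux slice [{p0}, {p1})')
# device 0: [0, 352), device 1: [352, 704), device 2: [704, 1000)
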
14 changes: 7 additions & 7 deletions gpu4pyscf/df/df_jk.py
@@ -26,7 +26,7 @@
 from gpu4pyscf.dft import rks, uks, numint
 from gpu4pyscf.scf import hf, uhf
 from gpu4pyscf.df import df, int3c2e
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 def _pin_memory(array):
     mem = cupy.cuda.alloc_pinned_memory(array.nbytes)
@@ -453,8 +453,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         mo_coeff = intopt.sort_orbitals(mo_coeff, axis=[1])
 
         futures = []
-        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-            for device_id in range(_num_devices):
+        with ThreadPoolExecutor(max_workers=num_devices) as executor:
+            for device_id in range(num_devices):
                 future = executor.submit(
                     _jk_task_with_mo,
                     dfobj, dms, mo_coeff, mo_occ,
@@ -474,8 +474,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
         mo1s = [intopt.sort_orbitals(mo1, axis=[1]) for mo1 in mo1s]
 
         futures = []
-        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-            for device_id in range(_num_devices):
+        with ThreadPoolExecutor(max_workers=num_devices) as executor:
+            for device_id in range(num_devices):
                 future = executor.submit(
                     _jk_task_with_mo1,
                     dfobj, dms, mo1s, occ_coeffs,
@@ -486,8 +486,8 @@ def get_jk(dfobj, dms_tag, hermi=0, with_j=True, with_k=True, direct_scf_tol=1e-
     # general K matrix with density matrix
     else:
         futures = []
-        with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-            for device_id in range(_num_devices):
+        with ThreadPoolExecutor(max_workers=num_devices) as executor:
+            for device_id in range(num_devices):
                 future = executor.submit(
                     _jk_task_with_dm, dfobj, dms,
                     hermi=hermi, device_id=device_id,
12 changes: 6 additions & 6 deletions gpu4pyscf/df/grad/jk.py
@@ -18,7 +18,7 @@
 from gpu4pyscf.df.int3c2e import get_int3c2e_ip_jk, VHFOpt, _split_tasks
 from gpu4pyscf.lib.cupy_helper import contract, concatenate, reduce_to_device
 from gpu4pyscf.lib import logger
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 def _jk_task(with_df, dm, orbo, with_j=True, with_k=True, device_id=0):
     ''' # (L|ij) -> rhoj: (L), rhok: (L|oo)
@@ -61,8 +61,8 @@ def get_rhojk(with_df, dm, orbo, with_j=True, with_k=True):
     '''
     futures = []
    cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _jk_task, with_df, dm, orbo,
                 with_j=with_j, with_k=with_k, device_id=device_id)
@@ -161,12 +161,12 @@ def get_grad_vjk(with_df, mol, auxmol, rhoj_cart, dm_cart, rhok_cart, orbo_cart,
 
     aux_ao_loc = np.array(intopt.aux_ao_loc)
     loads = aux_ao_loc[1:] - aux_ao_loc[:-1]
-    task_list = _split_tasks(loads, _num_devices)
+    task_list = _split_tasks(loads, num_devices)
 
     futures = []
     cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _jk_ip_task, intopt, rhoj_cart, dm_cart, rhok_cart, orbo_cart, task_list[device_id],
                 with_j=with_j, with_k=with_k, device_id=device_id, omega=omega)
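
get_grad_vjk distributes auxiliary shells by measured load rather than round-robin. The _split_tasks implementation itself is not part of this diff; the sketch below is only a plausible greedy splitter, included as an assumption for illustration:

import numpy as np

def _split_tasks_sketch(loads, num_devices):
    # Hypothetical stand-in for int3c2e._split_tasks: assign each task,
    # heaviest first, to the currently least-loaded device.
    task_list = [[] for _ in range(num_devices)]
    totals = np.zeros(num_devices)
    for task in np.argsort(loads)[::-1]:
        dev = int(np.argmin(totals))
        task_list[dev].append(int(task))
        totals[dev] += loads[task]
    return task_list

print(_split_tasks_sketch(np.array([5, 1, 9, 3, 7]), 2))
# -> [[2, 3, 1], [4, 0]], i.e. total loads 13 vs 12
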
14 changes: 7 additions & 7 deletions gpu4pyscf/df/hessian/jk.py
@@ -23,7 +23,7 @@
 from gpu4pyscf.hessian.jk import _ao2mo
 from gpu4pyscf.lib import logger
 from gpu4pyscf.lib.cupy_helper import contract, cart2sph, reduce_to_device
-from gpu4pyscf.__config__ import _streams, _num_devices
+from gpu4pyscf.__config__ import _streams, num_devices
 
 NROOT_ON_GPU = 7
 
@@ -171,8 +171,8 @@ def get_jk(dfobj, dms_tag, mo_coeff, mocc, hermi=0,
     mo_coeff = [intopt.sort_orbitals(mo, axis=[0]) for mo in mo_coeff]
 
     futures = []
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _jk_task_with_mo1,
                 dfobj, dms, mo_coeff, mo1s, occ_coeffs,
@@ -415,12 +415,12 @@ def get_int3c2e_hjk(intopt, rhoj, rhok, dm0_tag, with_j=True, with_k=True,
     ncp_ij = len(intopt.log_qs)
     tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
     task_list = []
-    for device_id in range(_num_devices):
-        task_list.append(tasks[device_id::_num_devices])
+    for device_id in range(num_devices):
+        task_list.append(tasks[device_id::num_devices])
 
     cupy.cuda.get_current_stream().synchronize()
-    with ThreadPoolExecutor(max_workers=_num_devices) as executor:
-        for device_id in range(_num_devices):
+    with ThreadPoolExecutor(max_workers=num_devices) as executor:
+        for device_id in range(num_devices):
             future = executor.submit(
                 _int3c2e_ipip_tasks, intopt, task_list[device_id],
                 rhoj, rhok, dm0_tag, orbo, with_j=with_j, with_k=with_k,
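
get_int3c2e_hjk instead interleaves its (ncp_k, ncp_ij) task grid round-robin via tasks[device_id::num_devices]. A tiny sketch of the stride trick, with illustrative sizes:

import itertools
import numpy as np

# Round-robin assignment: device d takes every num_devices-th task from offset d.
ncp_k, ncp_ij, num_devices = 3, 4, 2
tasks = np.array(list(itertools.product(range(ncp_k), range(ncp_ij))))
task_list = [tasks[d::num_devices] for d in range(num_devices)]
print(len(tasks), [len(t) for t in task_list])   # 12 [6, 6]
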
4 changes: 2 additions & 2 deletions gpu4pyscf/df/hessian/rks.py
@@ -46,8 +46,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = numpy.dot(mocc, mocc.T) * 2
 
-    if mf.nlc != '':
-        raise NotImplementedError
+    if mf.do_nlc():
+        raise NotImplementedError("2nd derivative of NLC is not implemented.")
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
4 changes: 2 additions & 2 deletions gpu4pyscf/df/hessian/uks.py
@@ -48,8 +48,8 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     moccb = mo_coeff[1][:,mo_occ[1]>0]
     dm0a = numpy.dot(mocca, mocca.T)
     dm0b = numpy.dot(moccb, moccb.T)
-    if mf.nlc != '':
-        raise NotImplementedError
+    if mf.do_nlc():
+        raise NotImplementedError("2nd derivative of NLC is not implemented.")
 
     omega, alpha, hyb = mf._numint.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
     with_k = mf._numint.libxc.is_hybrid_xc(mf.xc)
(Diff truncated; the remaining changed files are not shown here.)
