From 6639d26566c24484588ad55a765dcd7a04247479 Mon Sep 17 00:00:00 2001 From: Hanqing Wu Date: Tue, 31 Oct 2023 19:27:20 +0800 Subject: [PATCH] Fix metaserver deadlock caused by bthread coroutine switching Signed-off-by: Hanqing Wu --- curvefs/src/metaserver/partition.cpp | 97 ++++++++++++++++------------ src/common/concurrent/rw_lock.h | 58 +++++++++++++++-- 2 files changed, 109 insertions(+), 46 deletions(-) diff --git a/curvefs/src/metaserver/partition.cpp b/curvefs/src/metaserver/partition.cpp index 3c3e7d7b99..11663cb012 100644 --- a/curvefs/src/metaserver/partition.cpp +++ b/curvefs/src/metaserver/partition.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -537,8 +538,17 @@ MetaStatusCode Partition::GetAllBlockGroup( } void Partition::StartS3Compact() { - S3CompactManager::GetInstance().Register( - S3Compact{inodeManager_, partitionInfo_}); + // register s3 compaction task in a separate thread, since the caller may + // holds a pthread wrlock when calling this function, and create `S3Compact` + // will acquire a bthread rwlock, may cause thread switching, thus causing a + // deadlock. + // FIXME(wuhanqing): handle it in a more elegant way + auto handle = std::async(std::launch::async, [this]() { + S3CompactManager::GetInstance().Register( + S3Compact{inodeManager_, partitionInfo_}); + }); + + handle.wait(); } void Partition::CancelS3Compact() { @@ -546,45 +556,50 @@ void Partition::CancelS3Compact() { } void Partition::StartVolumeDeallocate() { - FsInfo fsInfo; - bool ok = - FsInfoManager::GetInstance().GetFsInfo(partitionInfo_.fsid(), &fsInfo); - if (!ok) { - LOG(ERROR) - << "Partition start volume deallocate fail, get fsinfo fail. fsid=" - << partitionInfo_.fsid(); - return; - } - - if (!fsInfo.detail().has_volume()) { - LOG(INFO) << "Partition not belong to volume, do not need start " - "deallocate. partitionInfo=" - << partitionInfo_.DebugString(); - return; - } - - VolumeDeallocateCalOption calOpt; - calOpt.kvStorage = kvStorage_; - calOpt.inodeStorage = inodeStorage_; - calOpt.nameGen = nameGen_; - auto copysetNode = - copyset::CopysetNodeManager::GetInstance().GetSharedCopysetNode( - partitionInfo_.poolid(), partitionInfo_.copysetid()); - if (copysetNode == nullptr) { - LOG(ERROR) << "Partition get copyset node failed. poolid=" - << partitionInfo_.poolid() - << ", copysetid=" << partitionInfo_.copysetid(); - return; - } - - InodeVolumeSpaceDeallocate task(partitionInfo_.fsid(), - partitionInfo_.partitionid(), copysetNode); - task.Init(calOpt); - - VolumeDeallocateManager::GetInstance().Register(std::move(task)); - - VLOG(3) << "Partition start volume deallocate success. partitionInfo=" - << partitionInfo_.DebugString(); + // FIXME(wuhanqing): same as `StartS3Compact` + auto handle = std::async(std::launch::async, [this]() { + FsInfo fsInfo; + bool ok = FsInfoManager::GetInstance().GetFsInfo( + partitionInfo_.fsid(), &fsInfo); + if (!ok) { + LOG(ERROR) << "Partition start volume deallocate fail, get fsinfo " + "fail. fsid=" + << partitionInfo_.fsid(); + return; + } + + if (!fsInfo.detail().has_volume()) { + LOG(INFO) << "Partition not belong to volume, do not need start " + "deallocate. partitionInfo=" + << partitionInfo_.DebugString(); + return; + } + + VolumeDeallocateCalOption calOpt; + calOpt.kvStorage = kvStorage_; + calOpt.inodeStorage = inodeStorage_; + calOpt.nameGen = nameGen_; + auto copysetNode = + copyset::CopysetNodeManager::GetInstance().GetSharedCopysetNode( + partitionInfo_.poolid(), partitionInfo_.copysetid()); + if (copysetNode == nullptr) { + LOG(ERROR) << "Partition get copyset node failed. poolid=" + << partitionInfo_.poolid() + << ", copysetid=" << partitionInfo_.copysetid(); + return; + } + + InodeVolumeSpaceDeallocate task( + partitionInfo_.fsid(), partitionInfo_.partitionid(), copysetNode); + task.Init(calOpt); + + VolumeDeallocateManager::GetInstance().Register(std::move(task)); + + VLOG(3) << "Partition start volume deallocate success. partitionInfo=" + << partitionInfo_.DebugString(); + }); + + handle.wait(); } void Partition::CancelVolumeDeallocate() { diff --git a/src/common/concurrent/rw_lock.h b/src/common/concurrent/rw_lock.h index d7c47c7d3c..807afb3b8c 100644 --- a/src/common/concurrent/rw_lock.h +++ b/src/common/concurrent/rw_lock.h @@ -23,13 +23,31 @@ #ifndef SRC_COMMON_CONCURRENT_RW_LOCK_H_ #define SRC_COMMON_CONCURRENT_RW_LOCK_H_ -#include #include -#include #include +#include +#include +#include // gettid +#include "include/curve_compiler_specific.h" #include "src/common/uncopyable.h" +// Due to the mixed use of bthread and pthread in some cases, acquiring another +// bthread lock(mutex/rwlock) after acquiring a write lock on a pthread rwlock +// may result in switching the bthread coroutine, and then the operation of +// releasing the previous write lock in the other pthread will not take effect +// (implying that the write lock is still held), thus causing a deadlock. + +// Check pthread rwlock tid between wrlock and unlock +#if defined(ENABLE_CHECK_PTHREAD_WRLOCK_TID) && \ + (ENABLE_CHECK_PTHREAD_WRLOCK_TID == 1) +#define CURVE_CHECK_PTHREAD_WRLOCK_TID 1 +#elif !defined(ENABLE_CHECK_PTHREAD_WRLOCK_TID) +#define CURVE_CHECK_PTHREAD_WRLOCK_TID 1 +#else +#define CURVE_CHECK_PTHREAD_WRLOCK_TID 0 +#endif + namespace curve { namespace common { @@ -51,10 +69,21 @@ class PthreadRWLockBase : public RWLockBase { void WRLock() override { int ret = pthread_rwlock_wrlock(&rwlock_); CHECK(0 == ret) << "wlock failed: " << ret << ", " << strerror(ret); +#if CURVE_CHECK_PTHREAD_WRLOCK_TID + tid_ = gettid(); +#endif } int TryWRLock() override { - return pthread_rwlock_trywrlock(&rwlock_); + int ret = pthread_rwlock_trywrlock(&rwlock_); + if (CURVE_UNLIKELY(ret != 0)) { + return ret; + } + +#if CURVE_CHECK_PTHREAD_WRLOCK_TID + tid_ = gettid(); +#endif + return 0; } void RDLock() override { @@ -67,6 +96,19 @@ class PthreadRWLockBase : public RWLockBase { } void Unlock() override { +#if CURVE_CHECK_PTHREAD_WRLOCK_TID + if (tid_ != 0) { + const pid_t current = gettid(); + // If CHECK here is triggered, please look at the comments at the + // beginning of the file. + // In the meantime, the simplest solution might be to use + // `BthreadRWLock` locks everywhere. + CHECK(tid_ == current) + << ", tid has changed, previous tid: " << tid_ + << ", current tid: " << current; + tid_ = 0; + } +#endif pthread_rwlock_unlock(&rwlock_); } @@ -76,8 +118,14 @@ class PthreadRWLockBase : public RWLockBase { pthread_rwlock_t rwlock_; pthread_rwlockattr_t rwlockAttr_; + +#if CURVE_CHECK_PTHREAD_WRLOCK_TID + pid_t tid_ = 0; +#endif }; +#undef CURVE_CHECK_PTHREAD_WRLOCK_TID + class RWLock : public PthreadRWLockBase { public: RWLock() { @@ -122,7 +170,7 @@ class BthreadRWLock : public RWLockBase { } int TryWRLock() override { - // not support yet + LOG(WARNING) << "TryWRLock not support yet"; return EINVAL; } @@ -132,7 +180,7 @@ class BthreadRWLock : public RWLockBase { } int TryRDLock() override { - // not support yet + LOG(WARNING) << "TryRDLock not support yet"; return EINVAL; }