Skip to content

Commit 157d2c7

Browse files
author
Changlong Chen
authored
Merge pull request #87 from skyzh/skyzh/zns-gc
add garbage collection for ZenFS
2 parents 00e757f + 9a5e9e7 commit 157d2c7

File tree

14 files changed

+257
-23
lines changed

14 files changed

+257
-23
lines changed

db/db_impl.cc

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
#include "table/block_based_table_factory.h"
6969
#include "table/merging_iterator.h"
7070
#include "table/two_level_iterator.h"
71+
#include "third-party/zenfs/fs/zbd_stat.h"
7172
#include "util/autovector.h"
7273
#include "util/build_version.h"
7374
#include "util/c_style_callback.h"
@@ -113,6 +114,8 @@
113114

114115
namespace TERARKDB_NAMESPACE {
115116

117+
std::vector<ZoneStat> GetStat(Env* env);
118+
116119
const std::string kDefaultColumnFamilyName("default");
117120
const uint64_t kDumpStatsWaitMicroseconds = 10000;
118121
const std::string kPersistentStatsColumnFamilyName(
@@ -985,6 +988,163 @@ void DBImpl::ScheduleTtlGC() {
985988
log_buffer_debug.FlushBufferToLog();
986989
}
987990

991+
#ifdef LIBZBD
992+
void DBImpl::ScheduleZNSGC() {
993+
TEST_SYNC_POINT("DBImpl:ScheduleZNSGC");
994+
uint64_t nowSeconds = env_->NowMicros() / 1000U / 1000U;
995+
LogBuffer log_buffer_info(InfoLogLevel::INFO_LEVEL,
996+
immutable_db_options_.info_log.get());
997+
LogBuffer log_buffer_debug(InfoLogLevel::DEBUG_LEVEL,
998+
immutable_db_options_.info_log.get());
999+
1000+
chash_set<uint64_t> mark_for_gc;
1001+
1002+
if (initial_db_options_.zenfs_gc_ratio <= 0.0 ||
1003+
initial_db_options_.zenfs_gc_ratio >= 1.0) {
1004+
// GC is not enabled
1005+
return;
1006+
}
1007+
1008+
// Pick files for GC
1009+
auto stat = GetStat(env_);
1010+
1011+
uint64_t number;
1012+
FileType type;
1013+
1014+
// Merge db paths and column family paths together
1015+
chash_set<std::string> db_paths;
1016+
1017+
// Get column family paths
1018+
mutex_.Lock();
1019+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1020+
for (const auto& path : cfd->ioptions()->db_paths) {
1021+
db_paths.emplace(path.path);
1022+
}
1023+
}
1024+
mutex_.Unlock();
1025+
1026+
// Get database paths
1027+
for (const auto& path : immutable_db_options_.db_paths) {
1028+
db_paths.emplace(path.path);
1029+
}
1030+
1031+
std::string strip_filename;
1032+
1033+
for (const auto& zone : stat) {
1034+
std::vector<uint64_t> sst_in_zone;
1035+
uint64_t written_data = zone.write_position - zone.start_position;
1036+
// zone is full
1037+
if (written_data == zone.total_capacity) {
1038+
uint64_t total_size = 0;
1039+
bool ignore_zone = false;
1040+
for (const auto& file : zone.files) {
1041+
strip_filename.clear();
1042+
1043+
for (const auto& path : db_paths) {
1044+
if (Slice(file.filename).starts_with(path)) {
1045+
strip_filename.assign(file.filename, path.length(),
1046+
file.filename.length() - path.length());
1047+
break;
1048+
}
1049+
}
1050+
1051+
if (strip_filename.empty()) {
1052+
// This file is not in DB folder.
1053+
ignore_zone = true;
1054+
break;
1055+
}
1056+
1057+
if (ParseFileName(strip_filename, &number, Slice(), &type)) {
1058+
// Is SST file, and is of current TerarkDB instance.
1059+
if (type == kTableFile) {
1060+
total_size += file.size_in_zone;
1061+
sst_in_zone.push_back(number);
1062+
} else {
1063+
// This zone contains file other than SSTs or files from other
1064+
// databases. We ignore the zone for now. When other files (like
1065+
// logs) have been deleted, we will come back and recycle this zone.
1066+
ignore_zone = true;
1067+
break;
1068+
}
1069+
} else {
1070+
// This file is not recognized by TerarkDB (or RocksDB). Even if we
1071+
// move the file, the zone may not be reset. Therefore, we simply
1072+
// ignore this zone.
1073+
ignore_zone = true;
1074+
break;
1075+
}
1076+
}
1077+
1078+
if (ignore_zone) {
1079+
continue;
1080+
}
1081+
1082+
// if data in zone <= (1 - ratio) * total_capacity, recycle the zone
1083+
if (total_size <=
1084+
(1.0 - initial_db_options_.zenfs_gc_ratio) * written_data) {
1085+
for (auto&& file_id : sst_in_zone) {
1086+
mark_for_gc.insert(file_id);
1087+
}
1088+
}
1089+
}
1090+
}
1091+
1092+
mutex_.Lock();
1093+
for (auto cfd : *versions_->GetColumnFamilySet()) {
1094+
uint64_t new_mark_count = 0;
1095+
uint64_t old_mark_count = 0;
1096+
uint64_t total_count = 0;
1097+
if (!cfd->initialized() || cfd->IsDropped()) {
1098+
continue;
1099+
}
1100+
VersionStorageInfo* vstorage = cfd->current()->storage_info();
1101+
// Level -1 contains SSTs inside lazy compaction SST index.
1102+
// By iterating level -1, we could collect that kind of garbage.
1103+
// But we still recommend using ZNS GC without lazy compaction
1104+
// enabled.
1105+
for (int l = -1; l < vstorage->num_non_empty_levels(); l++) {
1106+
for (auto meta : vstorage->LevelFiles(l)) {
1107+
if (meta->being_compacted) {
1108+
continue;
1109+
}
1110+
++total_count;
1111+
old_mark_count += meta->marked_for_compaction;
1112+
TEST_SYNC_POINT("DBImpl:Exist-SST");
1113+
if (!meta->marked_for_compaction &&
1114+
mark_for_gc.count(meta->fd.GetNumber()) > 0) {
1115+
meta->marked_for_compaction = true;
1116+
}
1117+
if (meta->marked_for_compaction) {
1118+
new_mark_count++;
1119+
TEST_SYNC_POINT("DBImpl:ScheduleZNSGC-mark");
1120+
}
1121+
}
1122+
}
1123+
if (new_mark_count > old_mark_count) {
1124+
vstorage->ComputeCompactionScore(*cfd->ioptions(),
1125+
*cfd->GetLatestMutableCFOptions());
1126+
if (!cfd->queued_for_compaction()) {
1127+
AddToCompactionQueue(cfd);
1128+
unscheduled_compactions_++;
1129+
}
1130+
}
1131+
if (old_mark_count != 0 && new_mark_count != 0) {
1132+
ROCKS_LOG_BUFFER(&log_buffer_info,
1133+
"[%s] ZNS GC: SSTs total marked = %" PRIu64
1134+
", new marked = %" PRIu64 ", file count: %" PRIu64,
1135+
cfd->GetName().c_str(), old_mark_count, new_mark_count,
1136+
total_count);
1137+
}
1138+
}
1139+
if (unscheduled_compactions_ > 0) {
1140+
MaybeScheduleFlushOrCompaction();
1141+
}
1142+
mutex_.Unlock();
1143+
log_buffer_info.FlushBufferToLog();
1144+
log_buffer_debug.FlushBufferToLog();
1145+
}
1146+
#endif
1147+
9881148
void DBImpl::DumpStats() {
9891149
TEST_SYNC_POINT("DBImpl::DumpStats:1");
9901150
#ifndef ROCKSDB_LITE

db/db_impl.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -410,11 +410,11 @@ class DBImpl : public DB {
410410
// Implemented in db_impl_debug.cc
411411

412412
// Compact any files in the named level that overlap [*begin, *end]
413-
Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
414-
ColumnFamilyHandle* column_family = nullptr,
415-
SeparationType separation_type =
416-
kCompactionTransToSeparate,
417-
bool disallow_trivial_move = false);
413+
Status TEST_CompactRange(
414+
int level, const Slice* begin, const Slice* end,
415+
ColumnFamilyHandle* column_family = nullptr,
416+
SeparationType separation_type = kCompactionTransToSeparate,
417+
bool disallow_trivial_move = false);
418418

419419
void TEST_SwitchWAL();
420420

@@ -806,6 +806,11 @@ class DBImpl : public DB {
806806

807807
void ScheduleTtlGC();
808808

809+
#ifdef LIBZBD
810+
// schedule GC by polling ZNS zone status
811+
void ScheduleZNSGC();
812+
#endif
813+
809814
protected:
810815
Env* const env_;
811816
const std::string dbname_;

db/db_impl_files.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,22 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
532532
to_delete;
533533
}
534534

535+
// TODO: Workaround for ZNS and Windows.
536+
// In TerarkDB, when a WAL is not needed, it is first deleted, and then
537+
// closed. This means that the underlying FS must support deferred delete.
538+
// In this case, we delete the writer before issuing delete to FS.
539+
if (type == kLogFile) {
540+
auto it =
541+
std::find_if(state.logs_to_free.begin(), state.logs_to_free.end(),
542+
[number](log::Writer* writer) {
543+
return writer->get_log_number() == number;
544+
});
545+
if (it != state.logs_to_free.end()) {
546+
delete *it;
547+
*it = nullptr;
548+
}
549+
}
550+
535551
#ifndef ROCKSDB_LITE
536552
if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
537553
immutable_db_options_.wal_size_limit_mb > 0)) {

db/periodic_work_scheduler.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ void PeriodicWorkScheduler::Register(DBImpl* dbi,
4545
initial_delay.fetch_add(1) % kDefaultScheduleGCTTLPeriodSec *
4646
kMicrosInSecond,
4747
kDefaultScheduleGCTTLPeriodSec * kMicrosInSecond);
48+
#ifdef LIBZBD
49+
timer->Add([dbi]() { dbi->ScheduleZNSGC(); },
50+
GetTaskName(dbi, "schedule_gc_zns"),
51+
initial_delay.fetch_add(1) % kDefaultScheduleZNSTTLPeriodSec *
52+
kMicrosInSecond,
53+
kDefaultScheduleZNSTTLPeriodSec * kMicrosInSecond);
54+
#endif
4855
}
4956

5057
void PeriodicWorkScheduler::Unregister(DBImpl* dbi) {
@@ -53,6 +60,9 @@ void PeriodicWorkScheduler::Unregister(DBImpl* dbi) {
5360
timer->Cancel(GetTaskName(dbi, "pst_st"));
5461
timer->Cancel(GetTaskName(dbi, "flush_info_log"));
5562
timer->Cancel(GetTaskName(dbi, "schedule_gc_ttl"));
63+
#ifdef LIBZBD
64+
timer->Cancel(GetTaskName(dbi, "schedule_gc_zns"));
65+
#endif
5666
if (!timer->HasPendingTask()) {
5767
timer->Shutdown();
5868
}

db/periodic_work_scheduler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class PeriodicWorkScheduler {
4141
// log.
4242
static const uint64_t kDefaultFlushInfoLogPeriodSec = 10;
4343
static const uint64_t kDefaultScheduleGCTTLPeriodSec = 10;
44+
static const uint64_t kDefaultScheduleZNSTTLPeriodSec = 10;
4445

4546
protected:
4647
std::unique_ptr<Timer> timer;

env/env_zenfs.cc

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#ifdef LIBZBD
66
#include "third-party/zenfs/fs/fs_zenfs.h"
7+
#include "third-party/zenfs/fs/zbd_stat.h"
78
#include "third-party/zenfs/fs/zbd_zenfs.h"
89

910
namespace TERARKDB_NAMESPACE {
@@ -166,9 +167,7 @@ class ZenfsDirectory : public Directory {
166167
explicit ZenfsDirectory(std::unique_ptr<FSDirectory>&& target)
167168
: target_(std::move(target)) {}
168169

169-
Status Fsync() override {
170-
return target_->Fsync(IOOptions(), nullptr);
171-
}
170+
// Forwards the sync request to the wrapped directory, using
// default-constructed IOOptions and a null second argument.
Status Fsync() override {
  const IOOptions default_options;
  return target_->Fsync(default_options, nullptr);
}
172171
size_t GetUniqueId(char* id, size_t max_size) const override {
173172
return target_->GetUniqueId(id, max_size);
174173
}
@@ -469,14 +468,20 @@ class ZenfsEnv : public EnvWrapper {
469468
target_->SanitizeEnvOptions(env_opts);
470469
}
471470

472-
Status GetZbdDiskSpaceInfo(uint64_t &total_size, uint64_t &avail_size, uint64_t &used_size) {
471+
// Reports space usage of the zoned block device behind fs_:
//   used_size  = used space + reclaimable space
//   avail_size = free space
//   total_size = used_size + avail_size
// Always returns Status::OK().
// NOTE(review): the dynamic_cast result is dereferenced without a null
// check; this presumably relies on fs_ always being a ZenFS - confirm.
Status GetZbdDiskSpaceInfo(uint64_t& total_size, uint64_t& avail_size,
                           uint64_t& used_size) {
  auto device = dynamic_cast<ZenFS*>(fs_)->GetZonedBlockDevice();
  used_size = device->GetUsedSpace() + device->GetReclaimableSpace();
  avail_size = device->GetFreeSpace();
  total_size = used_size + avail_size;
  return Status::OK();
}
479479

480+
std::vector<ZoneStat> GetStat() {
481+
auto zen_fs = dynamic_cast<ZenFS*>(fs_);
482+
return zen_fs->GetStat();
483+
}
484+
480485
private:
481486
Env* target_;
482487
FileSystem* fs_;
@@ -490,8 +495,16 @@ Status NewZenfsEnv(Env** zenfs_env, const std::string& zdb_path) {
490495
return s;
491496
}
492497

493-
Status GetZbdDiskSpaceInfo(Env* env, uint64_t &total_size, uint64_t &avail_size, uint64_t &used_size) {
494-
return dynamic_cast<ZenfsEnv*>(env)->GetZbdDiskSpaceInfo(total_size, avail_size, used_size);
498+
// Queries space usage of the zoned block device behind `env`.
//
// On success the three output references are filled in by
// ZenfsEnv::GetZbdDiskSpaceInfo and its status is returned. If `env` is not
// a ZenfsEnv (or is null), the outputs are left untouched and NotSupported is
// returned instead of dereferencing a failed dynamic_cast.
Status GetZbdDiskSpaceInfo(Env* env, uint64_t& total_size, uint64_t& avail_size,
                           uint64_t& used_size) {
  auto zen_env = dynamic_cast<ZenfsEnv*>(env);
  if (!zen_env) {
    return Status::NotSupported("GetZbdDiskSpaceInfo requires a ZenFS Env.");
  }
  return zen_env->GetZbdDiskSpaceInfo(total_size, avail_size, used_size);
}
503+
504+
// Returns the zone statistics of `env` when it is a ZenfsEnv; any other Env
// (including nullptr) yields an empty vector.
std::vector<ZoneStat> GetStat(Env* env) {
  if (auto* zenfs_env = dynamic_cast<ZenfsEnv*>(env)) {
    return zenfs_env->GetStat();
  }
  return {};
}
496509

497510
} // namespace TERARKDB_NAMESPACE
@@ -505,11 +518,13 @@ Status NewZenfsEnv(Env** zenfs_env, const std::string& zdb_path) {
505518
return Status::NotSupported("ZenFSEnv is not implemented.");
506519
}
507520

508-
Status GetZbdDiskSpaceInfo(Env* env, uint64_t &total_size, uint64_t &avail_size, uint64_t &used_size) {
521+
// Fallback for builds without LIBZBD: no argument is read or written and the
// call always reports NotSupported.
Status GetZbdDiskSpaceInfo(Env* /*env*/, uint64_t& /*total_size*/,
                           uint64_t& /*avail_size*/,
                           uint64_t& /*used_size*/) {
  return Status::NotSupported("GetZbdDiskSpaceInfo is not implemented.");
}
511525

526+
std::vector<ZoneStat> GetStat(Env* env) { return {}; }
527+
512528
} // namespace TERARKDB_NAMESPACE
513529

514530
#endif
515-

0 commit comments

Comments
 (0)