|
68 | 68 | #include "table/block_based_table_factory.h" |
69 | 69 | #include "table/merging_iterator.h" |
70 | 70 | #include "table/two_level_iterator.h" |
| 71 | +#include "third-party/zenfs/fs/zbd_stat.h" |
71 | 72 | #include "util/autovector.h" |
72 | 73 | #include "util/build_version.h" |
73 | 74 | #include "util/c_style_callback.h" |
|
113 | 114 |
|
114 | 115 | namespace TERARKDB_NAMESPACE { |
115 | 116 |
|
| 117 | +std::vector<ZoneStat> GetStat(Env* env); |
| 118 | + |
116 | 119 | const std::string kDefaultColumnFamilyName("default"); |
117 | 120 | const uint64_t kDumpStatsWaitMicroseconds = 10000; |
118 | 121 | const std::string kPersistentStatsColumnFamilyName( |
@@ -985,6 +988,163 @@ void DBImpl::ScheduleTtlGC() { |
985 | 988 | log_buffer_debug.FlushBufferToLog(); |
986 | 989 | } |
987 | 990 |
|
| 991 | +#ifdef LIBZBD |
| 992 | +void DBImpl::ScheduleZNSGC() { |
| 993 | + TEST_SYNC_POINT("DBImpl:ScheduleZNSGC"); |
| 994 | + uint64_t nowSeconds = env_->NowMicros() / 1000U / 1000U; |
| 995 | + LogBuffer log_buffer_info(InfoLogLevel::INFO_LEVEL, |
| 996 | + immutable_db_options_.info_log.get()); |
| 997 | + LogBuffer log_buffer_debug(InfoLogLevel::DEBUG_LEVEL, |
| 998 | + immutable_db_options_.info_log.get()); |
| 999 | + |
| 1000 | + chash_set<uint64_t> mark_for_gc; |
| 1001 | + |
| 1002 | + if (initial_db_options_.zenfs_gc_ratio <= 0.0 || |
| 1003 | + initial_db_options_.zenfs_gc_ratio >= 1.0) { |
| 1004 | + // GC is not enabled |
| 1005 | + return; |
| 1006 | + } |
| 1007 | + |
| 1008 | + // Pick files for GC |
| 1009 | + auto stat = GetStat(env_); |
| 1010 | + |
| 1011 | + uint64_t number; |
| 1012 | + FileType type; |
| 1013 | + |
| 1014 | + // Merge db paths and column family paths together |
| 1015 | + chash_set<std::string> db_paths; |
| 1016 | + |
| 1017 | + // Get column family paths |
| 1018 | + mutex_.Lock(); |
| 1019 | + for (auto cfd : *versions_->GetColumnFamilySet()) { |
| 1020 | + for (const auto& path : cfd->ioptions()->db_paths) { |
| 1021 | + db_paths.emplace(path.path); |
| 1022 | + } |
| 1023 | + } |
| 1024 | + mutex_.Unlock(); |
| 1025 | + |
| 1026 | + // Get database paths |
| 1027 | + for (const auto& path : immutable_db_options_.db_paths) { |
| 1028 | + db_paths.emplace(path.path); |
| 1029 | + } |
| 1030 | + |
| 1031 | + std::string strip_filename; |
| 1032 | + |
| 1033 | + for (const auto& zone : stat) { |
| 1034 | + std::vector<uint64_t> sst_in_zone; |
| 1035 | + uint64_t written_data = zone.write_position - zone.start_position; |
| 1036 | + // zone is full |
| 1037 | + if (written_data == zone.total_capacity) { |
| 1038 | + uint64_t total_size = 0; |
| 1039 | + bool ignore_zone = false; |
| 1040 | + for (const auto& file : zone.files) { |
| 1041 | + strip_filename.clear(); |
| 1042 | + |
| 1043 | + for (const auto& path : db_paths) { |
| 1044 | + if (Slice(file.filename).starts_with(path)) { |
| 1045 | + strip_filename.assign(file.filename, path.length(), |
| 1046 | + file.filename.length() - path.length()); |
| 1047 | + break; |
| 1048 | + } |
| 1049 | + } |
| 1050 | + |
| 1051 | + if (strip_filename.empty()) { |
| 1052 | + // This file is not in DB folder. |
| 1053 | + ignore_zone = true; |
| 1054 | + break; |
| 1055 | + } |
| 1056 | + |
| 1057 | + if (ParseFileName(strip_filename, &number, Slice(), &type)) { |
| 1058 | + // Is SST file, and is of current TerarkDB instance. |
| 1059 | + if (type == kTableFile) { |
| 1060 | + total_size += file.size_in_zone; |
| 1061 | + sst_in_zone.push_back(number); |
| 1062 | + } else { |
| 1063 | + // This zone contains file other than SSTs or files from other |
| 1064 | + // databases. We ignore the zone for now. When other files (like |
| 1065 | + // logs) have been deleted, we will come back and recycle this zone. |
| 1066 | + ignore_zone = true; |
| 1067 | + break; |
| 1068 | + } |
| 1069 | + } else { |
| 1070 | + // This file is not recognized by TerarkDB (or RocksDB). Even if we |
| 1071 | + // move the file, the zone may not be reset. Therefore, we simply |
| 1072 | + // ignore this zone. |
| 1073 | + ignore_zone = true; |
| 1074 | + break; |
| 1075 | + } |
| 1076 | + } |
| 1077 | + |
| 1078 | + if (ignore_zone) { |
| 1079 | + continue; |
| 1080 | + } |
| 1081 | + |
| 1082 | + // if data in zone <= (1 - ratio) * total_capacity, recycle the zone |
| 1083 | + if (total_size <= |
| 1084 | + (1.0 - initial_db_options_.zenfs_gc_ratio) * written_data) { |
| 1085 | + for (auto&& file_id : sst_in_zone) { |
| 1086 | + mark_for_gc.insert(file_id); |
| 1087 | + } |
| 1088 | + } |
| 1089 | + } |
| 1090 | + } |
| 1091 | + |
| 1092 | + mutex_.Lock(); |
| 1093 | + for (auto cfd : *versions_->GetColumnFamilySet()) { |
| 1094 | + uint64_t new_mark_count = 0; |
| 1095 | + uint64_t old_mark_count = 0; |
| 1096 | + uint64_t total_count = 0; |
| 1097 | + if (!cfd->initialized() || cfd->IsDropped()) { |
| 1098 | + continue; |
| 1099 | + } |
| 1100 | + VersionStorageInfo* vstorage = cfd->current()->storage_info(); |
| 1101 | + // Level -1 contains SSTs inside lazy compaction SST index. |
| 1102 | + // By iterating level -1, we could collect that kind of garbage. |
| 1103 | + // But we still recommend using ZNS GC without lazy compaction |
| 1104 | + // enabled. |
| 1105 | + for (int l = -1; l < vstorage->num_non_empty_levels(); l++) { |
| 1106 | + for (auto meta : vstorage->LevelFiles(l)) { |
| 1107 | + if (meta->being_compacted) { |
| 1108 | + continue; |
| 1109 | + } |
| 1110 | + ++total_count; |
| 1111 | + old_mark_count += meta->marked_for_compaction; |
| 1112 | + TEST_SYNC_POINT("DBImpl:Exist-SST"); |
| 1113 | + if (!meta->marked_for_compaction && |
| 1114 | + mark_for_gc.count(meta->fd.GetNumber()) > 0) { |
| 1115 | + meta->marked_for_compaction = true; |
| 1116 | + } |
| 1117 | + if (meta->marked_for_compaction) { |
| 1118 | + new_mark_count++; |
| 1119 | + TEST_SYNC_POINT("DBImpl:ScheduleZNSGC-mark"); |
| 1120 | + } |
| 1121 | + } |
| 1122 | + } |
| 1123 | + if (new_mark_count > old_mark_count) { |
| 1124 | + vstorage->ComputeCompactionScore(*cfd->ioptions(), |
| 1125 | + *cfd->GetLatestMutableCFOptions()); |
| 1126 | + if (!cfd->queued_for_compaction()) { |
| 1127 | + AddToCompactionQueue(cfd); |
| 1128 | + unscheduled_compactions_++; |
| 1129 | + } |
| 1130 | + } |
| 1131 | + if (old_mark_count != 0 && new_mark_count != 0) { |
| 1132 | + ROCKS_LOG_BUFFER(&log_buffer_info, |
| 1133 | + "[%s] ZNS GC: SSTs total marked = %" PRIu64 |
| 1134 | + ", new marked = %" PRIu64 ", file count: %" PRIu64, |
| 1135 | + cfd->GetName().c_str(), old_mark_count, new_mark_count, |
| 1136 | + total_count); |
| 1137 | + } |
| 1138 | + } |
| 1139 | + if (unscheduled_compactions_ > 0) { |
| 1140 | + MaybeScheduleFlushOrCompaction(); |
| 1141 | + } |
| 1142 | + mutex_.Unlock(); |
| 1143 | + log_buffer_info.FlushBufferToLog(); |
| 1144 | + log_buffer_debug.FlushBufferToLog(); |
| 1145 | +} |
| 1146 | +#endif |
| 1147 | + |
988 | 1148 | void DBImpl::DumpStats() { |
989 | 1149 | TEST_SYNC_POINT("DBImpl::DumpStats:1"); |
990 | 1150 | #ifndef ROCKSDB_LITE |
|
0 commit comments