Skip to content

Commit 5248fa6

Browse files
Priyank Warkhedemeta-codesync[bot]
authored andcommitted
Reduce fsdb server queuing for stats subscriptions
Summary: In fsdb server, limit stats subscription queue size to 8 in order to limit server-side memory usage due to subscription serve queue in case of slow subscribers. Rationale for choice of default value: queue size 8 provides server-side queueing to hold messages enqueued in 1 min interval (6 stats updates and 2 heartbeats). If subscriber is backlogged for longer than 1 min, fsdb server will disconnect the subscription with disconnect reason SLOW_SUBSCRIBER_QUEUE_FULL In prod the p99 latency observed for NSDB stats processing is 37 seconds, which is well within the latency bound indicated by this queue size change. So this change is not expected to cause disconnects for subscriptions that are normally working as of now, while protecting fsdb server from memory pressure due to slow stats subscriber. Differential Revision: D85967896 fbshipit-source-id: ae99ddc24ad2de6bfb2e92091e7cd85974029505
1 parent 5836d54 commit 5248fa6

File tree

6 files changed

+126
-8
lines changed

6 files changed

+126
-8
lines changed

fboss/fsdb/server/ServiceHandler.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ DEFINE_int32(
3939
10000,
4040
"Interval at which stats subscriptions are served");
4141

42+
// queue size for serving stats subscriptions is chosen
43+
// to accommodate pending updates generated over 1 min interval
44+
// (6 updates + 2 heartbeats).
45+
// This limits memory usage on server due to subscription serve
46+
// queue while not disconnecting subscriber aggressively.
47+
DEFINE_int32(
48+
statsSubscriptionServeQueueSize,
49+
8,
50+
"Max stats subscription updates to hold in server queue");
51+
4252
DEFINE_int32(
4353
statsSubscriptionHeartbeat_s,
4454
30,
@@ -272,7 +282,10 @@ ServiceHandler::ServiceHandler(
272282
"stats",
273283
options_.serveIdPathSubs,
274284
true,
275-
true)) {
285+
true,
286+
true /* serveGetRequestsWithLastPublishedState */,
287+
FLAGS_statsSubscriptionServeQueueSize /* pathSubscriptionServeQueueSize */,
288+
FLAGS_statsSubscriptionServeQueueSize /* defaultSubscriptionServeQueueSize */)) {
276289
num_instances_.incrementValue(1);
277290

278291
initPerStreamCounters();

fboss/fsdb/server/ServiceHandler.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include "fboss/lib/ThreadHeartbeat.h"
2020
#include "re2/re2.h"
2121

22+
DECLARE_int32(statsSubscriptionServeQueueSize);
23+
2224
DECLARE_bool(checkSubscriberConfig);
2325
DECLARE_bool(enforceSubscriberConfig);
2426
DECLARE_bool(checkOperOwnership);

fboss/fsdb/tests/client/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ cpp_unittest(
3939
"//fboss/fsdb/client:fsdb_stream_client",
4040
"//fboss/fsdb/if:fsdb_model",
4141
"//fboss/fsdb/oper:extended_path_builder",
42+
"//fboss/fsdb/server:handler",
4243
"//fboss/fsdb/tests/utils:fsdb_test_server",
4344
"//fboss/lib:common_utils",
4445
"//fboss/lib/thrift_service_client:thrift-service-client",

fboss/fsdb/tests/client/FsdbPubSubTest.cpp

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include <gtest/gtest.h>
44
#include "fboss/fsdb/oper/ExtendedPathBuilder.h"
5+
#include "fboss/fsdb/server/ServiceHandler.h"
56
#include "fboss/fsdb/tests/client/FsdbTestClients.h"
67
#include "fboss/fsdb/tests/utils/FsdbTestServer.h"
78
#include "fboss/lib/CommonUtils.h"
@@ -18,6 +19,7 @@ namespace {
1819

1920
const uint32_t kStateServeIntervalMs = 50;
2021
const uint32_t kStatsServeIntervalMs = 50;
22+
const uint32_t kSubscriptionServeQueueSize = 10;
2123
auto constexpr kSubscriberId = "fsdb_test_subscriber";
2224
auto constexpr kPublisherId = "fsdb_test_publisher";
2325
auto constexpr kUnknownPublisherId = "publisher_unknown";
@@ -45,7 +47,11 @@ class FsdbPubSubTest : public ::testing::Test {
4547
folly::LoggerDB::get().setLevel("fboss.fsdb", folly::LogLevel::DBG4);
4648
auto config = getFsdbConfig();
4749
fsdbTestServer_ = std::make_unique<FsdbTestServer>(
48-
std::move(config), 0, kStateServeIntervalMs, kStatsServeIntervalMs);
50+
std::move(config),
51+
0,
52+
kStateServeIntervalMs,
53+
kStatsServeIntervalMs,
54+
kSubscriptionServeQueueSize);
4955
publisherStreamEvbThread_ =
5056
std::make_unique<folly::ScopedEventBaseThread>();
5157
subscriberStreamEvbThread_ =
@@ -442,9 +448,91 @@ TYPED_TEST(FsdbPubSubTest, dupSubscriber) {
442448
EXPECT_NO_THROW({ auto res2 = this->subscribe(req); });
443449
}
444450

445-
TYPED_TEST(FsdbPubSubTest, slowSubscriber) {
446-
FLAGS_subscriptionServeQueueSize = 2;
451+
TYPED_TEST(FsdbPubSubTest, slowSubscriberDisconnectThreshold) {
452+
// verify threshold for number of pending updates for slow subscriber
453+
// disconnect
454+
455+
uint32_t queueSize = this->pubSubStats()
456+
? FLAGS_statsSubscriptionServeQueueSize
457+
: kSubscriptionServeQueueSize;
458+
uint32_t updatesPublished = 100 + queueSize;
459+
uint32_t subscriptionServeIntervalMs =
460+
this->pubSubStats() ? kStatsServeIntervalMs : kStateServeIntervalMs;
461+
uint32_t publishIntervalMs = subscriptionServeIntervalMs + 20;
462+
this->setupConnection(*this->publisher_, false);
463+
this->checkPublishing({this->publisher_->clientId()});
447464

465+
// pause subscriber on initial sync long enough for all updates to be
466+
// published and served so that queue builds up.
467+
folly::Baton<> waitForInitialSync, waitForDisconnect;
468+
folly::Baton<> resumeDataCb, resumeReconnect;
469+
bool initialSyncOnce{false}, disconnectOnce{false};
470+
auto slowSub = this->createSubscriber(
471+
"fsdb_slow_subscriber",
472+
[&waitForInitialSync, &resumeDataCb, &initialSyncOnce]() {
473+
if (!initialSyncOnce) {
474+
initialSyncOnce = true;
475+
waitForInitialSync.post();
476+
resumeDataCb.wait();
477+
}
478+
},
479+
[&waitForDisconnect, &resumeReconnect, &disconnectOnce]() {
480+
if (!disconnectOnce) {
481+
disconnectOnce = true;
482+
waitForDisconnect.post();
483+
resumeReconnect.wait();
484+
}
485+
});
486+
this->setupConnection(*slowSub);
487+
this->checkSubscribed({slowSub->clientId()});
488+
int updateNum{0};
489+
for (; updateNum < updatesPublished; updateNum++) {
490+
if (this->pubSubStats()) {
491+
this->publishPortStats(makePortStats(updateNum));
492+
} else {
493+
std::string testStr = folly::to<std::string>("bar", updateNum);
494+
this->publishAgentConfig(makeAgentConfig({{"foo", testStr}}));
495+
}
496+
std::this_thread::sleep_for(std::chrono::milliseconds(publishIntervalMs));
497+
}
498+
499+
// validate server does not disconnect subscriber yet
500+
waitForInitialSync.wait();
501+
SubscriptionInfo info = slowSub->getInfo();
502+
ASSERT_EQ(
503+
info.state, FsdbStreamClient::ReconnectingThriftClient::State::CONNECTED);
504+
505+
// post another update and validate that server disconnects the slow
506+
// subscriber
507+
if (this->pubSubStats()) {
508+
this->publishPortStats(makePortStats(updateNum));
509+
} else {
510+
std::string testStr = folly::to<std::string>("bar", updateNum);
511+
this->publishAgentConfig(makeAgentConfig({{"foo", testStr}}));
512+
}
513+
std::this_thread::sleep_for(std::chrono::milliseconds(publishIntervalMs));
514+
515+
// resume subscriber data callback after all updates are published
516+
resumeDataCb.post();
517+
518+
waitForDisconnect.wait();
519+
info = slowSub->getInfo();
520+
ASSERT_EQ(
521+
info.disconnectReason, FsdbErrorCode::SUBSCRIPTION_SERVE_QUEUE_FULL);
522+
523+
WITH_RETRIES_N(90, {
524+
fb303::ThreadCachedServiceData::get()->publishStats();
525+
auto counterName = folly::sformat(
526+
"{}.subscriber.{}.disconnects.slow_subscriber.count",
527+
this->pubSubStats() ? "stats" : "fsdb",
528+
"unspecified");
529+
EXPECT_EVENTUALLY_GT(fb303::ServiceData::get()->getCounter(counterName), 0);
530+
});
531+
532+
resumeReconnect.post();
533+
}
534+
535+
TYPED_TEST(FsdbPubSubTest, slowSubscriber) {
448536
// publishInterval: wait for subscriptionServeIntervalMs+delta to prevent
449537
// published updates from being coalesced
450538
uint32_t updatesPublished = 120;
@@ -486,7 +574,6 @@ TYPED_TEST(FsdbPubSubTest, slowSubscriber) {
486574
std::this_thread::sleep_for(std::chrono::milliseconds(publishIntervalMs));
487575
}
488576

489-
// TODO: validate subscription serve queue watermark counter
490577
// resume subscriber data callback after all updates are published
491578
resumeDataCb.post();
492579

@@ -513,7 +600,6 @@ TYPED_TEST(FsdbPubSubTest, slowSubscriber) {
513600
}
514601

515602
TYPED_TEST(FsdbPubSubTest, slowSubscriberQueueWatermark) {
516-
FLAGS_subscriptionServeQueueSize = 100;
517603
FLAGS_forceCloseSlowSubscriber = false;
518604

519605
// publishInterval: wait for subscriptionServeIntervalMs+delta to prevent

fboss/fsdb/tests/utils/FsdbTestServer.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,21 @@ FsdbTestServer::FsdbTestServer(
1212
std::shared_ptr<FsdbConfig> config,
1313
uint16_t port,
1414
uint32_t stateSubscriptionServe_ms,
15-
uint32_t statsSubscriptionServe_ms) {
15+
uint32_t statsSubscriptionServe_ms,
16+
uint32_t subscriptionServeQueueSize,
17+
uint32_t statsSubscriptionServeQueueSize) {
18+
auto queueSize = std::to_string(subscriptionServeQueueSize);
19+
gflags::SetCommandLineOptionWithMode(
20+
"subscriptionServeQueueSize",
21+
queueSize.c_str(),
22+
gflags::SET_FLAG_IF_DEFAULT);
23+
24+
auto statsQueueSize = std::to_string(statsSubscriptionServeQueueSize);
25+
gflags::SetCommandLineOptionWithMode(
26+
"statsSubscriptionServeQueueSize",
27+
statsQueueSize.c_str(),
28+
gflags::SET_FLAG_IF_DEFAULT);
29+
1630
// Run tests faster
1731
auto stateServeInterval = std::to_string(stateSubscriptionServe_ms);
1832
auto statsServeInterval = std::to_string(statsSubscriptionServe_ms);

fboss/fsdb/tests/utils/FsdbTestServer.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ class FsdbTestServer {
2727
std::shared_ptr<FsdbConfig> config,
2828
uint16_t port = 0,
2929
uint32_t stateSubscriptionServe_ms = 50,
30-
uint32_t statsSubscriptionServe_ms = 1000);
30+
uint32_t statsSubscriptionServe_ms = 1000,
31+
uint32_t subscriptionServeQueueSize = 1000,
32+
uint32_t statsSubscriptionServeQueueSize = 8);
3133
~FsdbTestServer();
3234

3335
uint16_t getFsdbPort() const {

0 commit comments

Comments
 (0)