Skip to content

Commit 3d6898e

Browse files
committed
[o11y] Additional hardening against missing outcome event, STW optimizations
- Introduce a new WorkerTracer parameter indicating whether any BTWs are present. In a follow-up, this will be used to optimize memory management, but for now it helps us assert that if we have a tracer with logLevel none, we have BTWs (otherwise the tracer would be redundant, indicating waste). - Log an error if a WorkerTracer is destructed without getting the Outcome event even when logLevel == none
1 parent 23283e9 commit 3d6898e

File tree

3 files changed

+51
-42
lines changed

3 files changed

+51
-42
lines changed

src/workerd/io/tracer.c++

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ kj::Own<WorkerTracer> PipelineTracer::makeWorkerTracer(PipelineLogLevel pipeline
9797
executionModel, kj::mv(durableObjectId));
9898
traces.add(kj::addRef(*trace));
9999
return kj::refcounted<WorkerTracer>(
100-
addRefToThis(), kj::mv(trace), pipelineLogLevel, kj::mv(maybeTailStreamWriter));
100+
addRefToThis(), kj::mv(trace), pipelineLogLevel, kj::mv(maybeTailStreamWriter), buffered);
101101
}
102102

103103
void PipelineTracer::addTrace(rpc::Trace::Reader reader) {
@@ -111,29 +111,29 @@ void PipelineTracer::addTailStreamWriter(kj::Own<tracing::TailStreamWriter>&& wr
111111
WorkerTracer::WorkerTracer(kj::Rc<PipelineTracer> parentPipeline,
112112
kj::Own<Trace> trace,
113113
PipelineLogLevel pipelineLogLevel,
114-
kj::Maybe<kj::Own<tracing::TailStreamWriter>> maybeTailStreamWriter)
114+
kj::Maybe<kj::Own<tracing::TailStreamWriter>> maybeTailStreamWriter,
115+
bool buffered)
115116
: pipelineLogLevel(pipelineLogLevel),
116117
trace(kj::mv(trace)),
117118
parentPipeline(kj::mv(parentPipeline)),
118-
maybeTailStreamWriter(kj::mv(maybeTailStreamWriter)) {}
119-
120-
WorkerTracer::WorkerTracer(PipelineLogLevel pipelineLogLevel, ExecutionModel executionModel)
121-
: pipelineLogLevel(pipelineLogLevel),
122-
trace(kj::refcounted<Trace>(
123-
kj::none, kj::none, kj::none, kj::none, kj::none, nullptr, kj::none, executionModel)) {}
119+
maybeTailStreamWriter(kj::mv(maybeTailStreamWriter)),
120+
buffered(buffered) {
121+
// If logLevel is none, we should not be creating an STW (we don't provide automated tracing in
122+
// that case, so the tail stream writer is redundant), and we should have BTW tracers available
123+
// (otherwise we should have skipped setting up a WorkerTracer entirely).
124+
if (pipelineLogLevel == PipelineLogLevel::NONE) {
125+
KJ_ASSERT(maybeTailStreamWriter == kj::none);
126+
KJ_ASSERT(buffered);
127+
}
128+
}
124129

125130
WorkerTracer::~WorkerTracer() noexcept(false) {
126131
// Report the outcome event, which should have been delivered by now.
127132

128-
// Do not attempt to report an outcome event if logging is disabled, as with other event types.
129-
if (pipelineLogLevel == PipelineLogLevel::NONE) {
130-
return;
131-
}
132-
133133
// Report the outcome event if STWs are present. All worker events need to call setEventInfo at
134134
// the start of the invocation to submit the onset event before any other tail events.
135-
KJ_IF_SOME(writer, maybeTailStreamWriter) {
136-
KJ_IF_SOME(spanContext, topLevelInvocationSpanContext) {
135+
KJ_IF_SOME(spanContext, topLevelInvocationSpanContext) {
136+
KJ_IF_SOME(writer, maybeTailStreamWriter) {
137137
if (isPredictableModeForTest()) {
138138
writer->report(spanContext,
139139
tracing::Outcome(trace->outcome, 0 * kj::MILLISECONDS, 0 * kj::MILLISECONDS),
@@ -142,18 +142,18 @@ WorkerTracer::~WorkerTracer() noexcept(false) {
142142
writer->report(spanContext,
143143
tracing::Outcome(trace->outcome, trace->cpuTime, trace->wallTime), completeTime);
144144
}
145-
} else {
146-
// If no span context is available, we have a streaming tail worker set up but shut down the
147-
// worker tracer without ever sending an Onset event. In that case we either failed to set up
148-
// the Onset properly (indicating a bug – all event types are required to report an Onset at
149-
// the start – although this is more likely to manifest as a "Tail stream onset was not
150-
// reported" error) or we created a WorkerInterface with WorkerTracer without ever invoking it
151-
// (which is not incorrect behavior, but likely indicates inefficient code that sets up
152-
// WorkerInterfaces and then ends up not using it due to an error/incorrect parameters; such
153-
// error checking should be done beforehand to avoid unused allocations). Report such cases.
154-
LOG_ERROR_PERIODICALLY(
155-
"destructed WorkerTracer with STW without reporting Onset event", kj::getStackTrace());
156145
}
146+
} else {
147+
// If no span context is available, we have a streaming tail worker set up but shut down the
148+
// worker tracer without ever sending an Onset event. In that case we either failed to set up
149+
// the Onset properly (indicating a bug – all event types are required to report an Onset at
150+
// the start – although this is more likely to manifest as a "Tail stream onset was not
151+
// reported" error) or we created a WorkerInterface with WorkerTracer without ever invoking it
152+
// (which is not incorrect behavior, but likely indicates inefficient code that sets up
153+
// WorkerInterfaces and then ends up not using it due to an error/incorrect parameters; such
154+
// error checking should be done beforehand to avoid unused allocations). Report such cases.
155+
LOG_ERROR_PERIODICALLY(
156+
"destructed WorkerTracer with STW without reporting Onset event", kj::getStackTrace());
157157
}
158158
};
159159

@@ -174,8 +174,8 @@ void WorkerTracer::addLog(const tracing::InvocationSpanContext& context,
174174
KJ_IF_SOME(writer, maybeTailStreamWriter) {
175175
// If message is too big on its own, truncate it.
176176
writer->report(context,
177-
{(tracing::Log(timestamp, logLevel,
178-
kj::str(message.first(kj::min(message.size(), MAX_TRACE_BYTES)))))},
177+
{tracing::Log(
178+
timestamp, logLevel, kj::str(message.first(kj::min(message.size(), MAX_TRACE_BYTES))))},
179179
timestamp);
180180
}
181181

@@ -196,14 +196,9 @@ void WorkerTracer::addLog(const tracing::InvocationSpanContext& context,
196196
}
197197

198198
void WorkerTracer::addSpan(tracing::CompleteSpan&& span) {
199-
// This is where we'll actually encode the span.
200-
if (pipelineLogLevel == PipelineLogLevel::NONE) {
201-
return;
202-
}
203-
204-
// Note: spans are not available in the buffered tail worker, so we don't need an exceededSpanLimit
205-
// variable for it and it can't cause truncation.
206-
auto& tailStreamWriter = KJ_UNWRAP_OR_RETURN(maybeTailStreamWriter);
199+
// This is where we'll actually encode the span. This function should never be invoked if STW is
200+
// inactive as span tracing is only used in STW.
201+
auto& tailStreamWriter = KJ_ASSERT_NONNULL(maybeTailStreamWriter);
207202

208203
adjustSpanTime(span);
209204

@@ -346,6 +341,9 @@ void WorkerTracer::setEventInfoInternal(
346341
const tracing::InvocationSpanContext& context, kj::Date timestamp, tracing::EventInfo&& info) {
347342
KJ_ASSERT(trace->eventInfo == kj::none, "tracer can only be used for a single event");
348343

344+
// Always set up span context to indicate that the Onset has been set.
345+
this->topLevelInvocationSpanContext = context.clone();
346+
349347
// TODO(someday): For now, we're using logLevel == none as a hint to avoid doing anything
350348
// expensive while tracing. We may eventually want separate configuration for event info vs.
351349
// logs.
@@ -356,7 +354,6 @@ void WorkerTracer::setEventInfoInternal(
356354
}
357355

358356
trace->eventTimestamp = timestamp;
359-
this->topLevelInvocationSpanContext = context.clone();
360357

361358
size_t eventSize = 0;
362359
KJ_SWITCH_ONEOF(info) {
@@ -519,7 +516,7 @@ void WorkerTracer::setReturn(
519516
}
520517

521518
KJ_IF_SOME(writer, maybeTailStreamWriter) {
522-
auto& spanContext = KJ_UNWRAP_OR_RETURN(topLevelInvocationSpanContext);
519+
auto& spanContext = KJ_ASSERT_NONNULL(topLevelInvocationSpanContext);
523520

524521
// Fall back to weak IoContext if no timestamp is available
525522
writer->report(spanContext,

src/workerd/io/tracer.h

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class WorkerTracer;
4848
class PipelineTracer: public kj::Refcounted {
4949
public:
5050
// Creates a pipeline tracer (with a possible parent).
51-
explicit PipelineTracer() = default;
51+
PipelineTracer(bool buffered): buffered(buffered) {};
5252
virtual ~PipelineTracer() noexcept(false);
5353
KJ_DISALLOW_COPY_AND_MOVE(PipelineTracer);
5454

@@ -79,6 +79,15 @@ class PipelineTracer: public kj::Refcounted {
7979

8080
void addTailStreamWriter(kj::Own<tracing::TailStreamWriter>&& writer);
8181

82+
inline bool hasBuffered() {
83+
return buffered;
84+
}
85+
86+
protected:
87+
// Indicates that buffered tail workers may be present. There may also be streaming tail workers
88+
// present at the same time.
89+
bool buffered;
90+
8291
private:
8392
kj::Vector<kj::Own<Trace>> traces;
8493
kj::Maybe<kj::Own<kj::PromiseFulfiller<kj::Array<kj::Own<Trace>>>>> completeFulfiller;
@@ -174,8 +183,8 @@ class WorkerTracer final: public BaseTracer {
174183
explicit WorkerTracer(kj::Rc<PipelineTracer> parentPipeline,
175184
kj::Own<Trace> trace,
176185
PipelineLogLevel pipelineLogLevel,
177-
kj::Maybe<kj::Own<tracing::TailStreamWriter>> maybeTailStreamWriter);
178-
explicit WorkerTracer(PipelineLogLevel pipelineLogLevel, ExecutionModel executionModel);
186+
kj::Maybe<kj::Own<tracing::TailStreamWriter>> maybeTailStreamWriter,
187+
bool buffered);
179188
virtual ~WorkerTracer() noexcept(false);
180189
KJ_DISALLOW_COPY_AND_MOVE(WorkerTracer);
181190

@@ -233,5 +242,8 @@ class WorkerTracer final: public BaseTracer {
233242
kj::Maybe<kj::Rc<PipelineTracer>> parentPipeline;
234243

235244
kj::Maybe<kj::Own<tracing::TailStreamWriter>> maybeTailStreamWriter;
245+
// Whether any BTWs are present. Note that this serves as an optimization – it is legal for BTWs
246+
// to be absent even when this is true.
247+
bool buffered;
236248
};
237249
} // namespace workerd

src/workerd/server/server.c++

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2167,7 +2167,7 @@ class Server::WorkerService final: public Service,
21672167
if (legacyTailWorkers.size() > 0 || streamingTailWorkers.size() > 0) {
21682168
// Setting up buffered tail workers support, but only if we actually have tail workers
21692169
// configured.
2170-
auto tracer = kj::rc<PipelineTracer>();
2170+
auto tracer = kj::rc<PipelineTracer>(legacyTailWorkers.size() > 0);
21712171
auto executionModel =
21722172
actor == kj::none ? ExecutionModel::STATELESS : ExecutionModel::DURABLE_OBJECT;
21732173
auto tailStreamWriter = tracing::initializeTailStreamWriter(

0 commit comments

Comments
 (0)