Commit feab016

fix: Add OTEL Kafka logs definition for broker and cluster stg entities (#2480)
* fix: Fixed otel topic to apm rel
* fix: Add OTEL Kafka logs ent defs
* fix: Make aggre func consistent on otel kafka broker golden metrics
* fix: Added log test files
* fix: Added log test files
* fix: Removed trace defs
* fix: Dashboard corrections
* fix: Bug fixes
* fix: Bug fixes
* fix: Bug fixes
1 parent e1ce11d commit feab016

16 files changed, +306 −157 lines changed

entity-types/infra-kafkabroker/definition.stg.yml

Lines changed: 55 additions & 9 deletions
@@ -4,35 +4,79 @@ type: KAFKABROKER
 synthesis:
   rules:
     # This rule is for OTEL kafka receivers metrics
-    - ruleName: infra_kafkabroker_kafka_broker_name
-      identifier: kafka.broker.name
-      name: kafka.broker.name
+    - ruleName: infra_kafkabroker_kafka_broker_name_stg_1
+      compositeIdentifier:
+        separator: ":"
+        attributes:
+          - broker.id
+          - kafka.cluster.name
+      compositeName:
+        fragments:
+          - value: "BrokerId: "
+          - attribute: broker.id
+          - value: " ("
+          - attribute: kafka.cluster.name
+          - value: ")"
       encodeIdentifierInGUID: true
       conditions:
         - attribute: eventType
           value: Metric
         - attribute: instrumentation.provider
           value: opentelemetry
-        - attribute: kafka.broker.name
+        - attribute: broker.id
+          present: true
+        - attribute: kafka.cluster.name
           present: true
         - attribute: topic
           present: false
       tags:
         # Environment resource attributes
-        host.name:
-          entityTagName: hostName
         instrumentation.provider:
         kafka.cluster.name:
-          entityTagName: clusterName
         otel.library.name:
           entityTagName: instrumentation.name
         broker.id:
-          entityTagName: brokerId
         k8s.pod.name:
         k8s.node.name:
         k8s.pod.uid:
         k8s.namespace.name:
 
+    # This rule is for OTEL kafka broker logs
+    - ruleName: infra_kafkabroker_kafka_broker_name_stg_2
+      compositeIdentifier:
+        separator: ":"
+        attributes:
+          - broker.id
+          - kafka.cluster.name
+      compositeName:
+        fragments:
+          - value: "BrokerId: "
+          - attribute: broker.id
+          - value: " ("
+          - attribute: kafka.cluster.name
+          - value: ")"
+      encodeIdentifierInGUID: true
+      conditions:
+        - attribute: eventType
+          value: Log
+        - attribute: instrumentation.provider
+          value: opentelemetry
+        - attribute: topic
+          present: false
+        - attribute: broker.id
+          present: true
+        - attribute: kafka.cluster.name
+          present: true
+      tags:
+        instrumentation.provider:
+        kafka.cluster.name:
+        otel.library.name:
+          entityTagName: instrumentation.name
+        broker.id:
+        k8s.pod.name:
+        k8s.node.name:
+        k8s.pod.uid:
+        k8s.namespace.name:
 
 dashboardTemplates:
   # This should match the entity created from the ohi in the infra pipeline
@@ -44,7 +88,9 @@ dashboardTemplates:
 
 goldenTags:
   - clusterName
-  - hostName
+  - broker.id
+  - brokerId
+  - kafka.cluster.name
 
 configuration:
   entityExpirationTime: DAILY
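As a quick sanity check, a NRQL query along these lines (a sketch, not part of this change) should surface the broker/cluster pairs the new log rule infra_kafkabroker_kafka_broker_name_stg_2 would match, assuming OTEL Kafka broker logs are already flowing into the account:

  FROM Log SELECT count(*)
  WHERE instrumentation.provider = 'opentelemetry'
    AND broker.id IS NOT NULL
    AND kafka.cluster.name IS NOT NULL
    AND topic IS NULL
  FACET broker.id, kafka.cluster.name

Each facet pair would map to one synthesized broker entity: the identifier joins the two attributes with ":" and the display name follows the compositeName fragments, e.g. "BrokerId: 1 (my-cluster)" for a hypothetical broker.id of 1 in a cluster named my-cluster.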

entity-types/infra-kafkabroker/golden_metrics.stg.yml

Lines changed: 22 additions & 7 deletions
@@ -1,5 +1,5 @@
 incomingMessagesPerSecond:
-  title: Incoming messages per second
+  title: Messages in per second
   unit: OPERATIONS_PER_SECOND
   queries:
     newRelic:
@@ -28,23 +28,38 @@ bytesInPerSecond:
       where: metricName = 'kafka.network.io' AND (state = 'in' OR direction = 'in')
       eventId: entity.guid
 
+bytesOutSecond:
+  title: Bytes out per second
+  unit: BYTES_PER_SECOND
+  queries:
+    newRelic:
+      select: average(broker.IOOutPerSecond)
+      from: KafkaBrokerSample
+      eventId: entityGuid
+      eventName: entityName
+    opentelemetry:
+      select: sum(kafka.network.io) / sum((endTimestamp - timestamp) / 1000)
+      from: Metric
+      where: metricName = 'kafka.network.io' AND (state = 'out' OR direction = 'out')
+      eventId: entity.guid
+
 underReplicatedPartitions:
   title: Under replicated partitions
   unit: COUNT
   queries:
     newRelic:
-      select: latest(replication.unreplicatedPartitions)
+      select: max(replication.unreplicatedPartitions)
       from: KafkaBrokerSample
       eventId: entityGuid
       eventName: entityName
     opentelemetry:
-      select: latest(kafka.partition.under_replicated)
+      select: max(kafka.partition.under_replicated)
       from: Metric
       where: metricName = 'kafka.partition.under_replicated'
       eventId: entity.guid
 
 produceRequestLatency99p:
-  title: Produce request latency (99th percentile)
+  title: Produce request latency (p99)
   unit: MS
   queries:
     newRelic:
@@ -58,8 +73,8 @@ produceRequestLatency99p:
       where: metricName = 'kafka.request.time.99p' AND type = 'Produce'
       eventId: entity.guid
 
-failedRequestsPerSecond:
-  title: Failed requests per second
+produceRequestsFailedPerSecond:
+  title: Failed produce requests per second
   unit: OPERATIONS_PER_SECOND
   queries:
     newRelic:
@@ -70,5 +85,5 @@ failedRequestsPerSecond:
     opentelemetry:
       select: sum(kafka.request.failed) / sum((endTimestamp - timestamp) / 1000)
       from: Metric
-      where: metricName = 'kafka.request.failed'
+      where: metricName = 'kafka.request.failed' AND type = 'produce'
       eventId: entity.guid
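The opentelemetry queries above derive per-second rates by dividing the summed counter values by the summed reporting windows: (endTimestamp - timestamp) is each data point's window in milliseconds, so dividing by 1000 converts it to seconds and the ratio approximates an average rate over the query window. As a rough spot check outside entity scoping (assuming the OTEL kafka.network.io metric is being reported), the new bytesOutSecond definition corresponds to a query like:

  FROM Metric
  SELECT sum(kafka.network.io) / sum((endTimestamp - timestamp) / 1000) AS 'Bytes out per second'
  WHERE metricName = 'kafka.network.io' AND (state = 'out' OR direction = 'out')
  TIMESERIES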

entity-types/infra-kafkabroker/opentelemetry_dashboard.stg.json

Lines changed: 24 additions & 24 deletions
@@ -3,7 +3,7 @@
   "description": "Performance and health metrics for a single Kafka Broker",
   "pages": [
     {
-      "name": "Broker Overview",
+      "name": "Broker overview",
       "widgets": [
         {
           "title": "Broker details",
@@ -20,13 +20,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT latest(kafka.partition.count) as 'Partitions on this broker' FROM Metric WHERE metricName='kafka.partition.count' facet kafka.cluster.name as 'Kafka cluster name', broker.id as 'Broker Id' limit 1"
+                "query": "SELECT latest(kafka.partition.count) as 'Partitions on this broker' FROM Metric WHERE metricName='kafka.partition.count' facet kafka.cluster.name as 'Kafka cluster name', broker.id as 'Broker ID' limit 1"
               }
             ]
           }
         },
         {
-          "title": "Messages In Per Second",
+          "title": "Messages in per second",
           "layout": {
             "column": 1,
             "row": 2,
@@ -40,13 +40,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT sum(kafka.message.count) / sum((endTimestamp - timestamp) / 1000) AS `Messages In per Second` FROM Metric WHERE metricName = 'kafka.message.count' TIMESERIES AUTO"
+                "query": "SELECT sum(kafka.message.count) / sum((endTimestamp - timestamp) / 1000) AS `Messages in per second` FROM Metric WHERE metricName = 'kafka.message.count' TIMESERIES AUTO"
               }
             ]
           }
         },
         {
-          "title": "Bytes In and Out",
+          "title": "Bytes in and out",
           "layout": {
             "column": 7,
             "row": 2,
@@ -60,13 +60,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT filter(sum(kafka.network.io) /sum((endTimestamp - timestamp) / 1000), WHERE (state = 'in' or direction = 'in')) AS `Bytes In`, filter(sum(kafka.network.io) / sum((endTimestamp - timestamp) / 1000), WHERE (state = 'out' or direction = 'out')) AS `Bytes Out` FROM Metric WHERE metricName = 'kafka.network.io' TIMESERIES AUTO"
+                "query": "SELECT filter(sum(kafka.network.io) /sum((endTimestamp - timestamp) / 1000), WHERE (state = 'in' or direction = 'in')) AS `Bytes in`, filter(sum(kafka.network.io) / sum((endTimestamp - timestamp) / 1000), WHERE (state = 'out' or direction = 'out')) AS `Bytes out` FROM Metric WHERE metricName = 'kafka.network.io' TIMESERIES AUTO"
               }
             ]
           }
         },
         {
-          "title": "Under Replicated Partitions",
+          "title": "Under replicated partitions",
           "layout": {
             "column": 1,
             "row": 5,
@@ -80,13 +80,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT max(kafka.partition.under_replicated) AS 'Under Replicated Partitions' FROM Metric WHERE metricName='kafka.partition.under_replicated' TIMESERIES AUTO"
+                "query": "SELECT max(kafka.partition.under_replicated) AS 'Under replicated partitions' FROM Metric WHERE metricName='kafka.partition.under_replicated' TIMESERIES AUTO"
               }
             ]
           }
         },
         {
-          "title": "Under Min ISR Partitions",
+          "title": "Under min ISR partitions",
           "layout": {
             "column": 7,
             "row": 5,
@@ -100,13 +100,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT max(kafka.partition.under_min_isr) AS 'Under Min ISR Partitions' FROM Metric WHERE metricName = 'kafka.partition.under_min_isr' TIMESERIES AUTO"
+                "query": "SELECT max(kafka.partition.under_min_isr) AS 'Under min ISR partitions' FROM Metric WHERE metricName = 'kafka.partition.under_min_isr' TIMESERIES AUTO"
               }
             ]
           }
         },
         {
-          "title": "In-Sync Replica Operations (per min)",
+          "title": "In-sync replica operations (per min)",
           "layout": {
             "column": 1,
             "row": 8,
@@ -120,13 +120,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT rate(sum(kafka.isr.operation.count), 1 minute) AS 'ISR Operations' FROM Metric WHERE metricName = 'kafka.isr.operation.count' TIMESERIES AUTO FACET operation"
+                "query": "SELECT rate(sum(kafka.isr.operation.count), 1 minute) AS 'ISR operations' FROM Metric WHERE metricName = 'kafka.isr.operation.count' TIMESERIES AUTO FACET operation"
               }
             ]
           }
         },
         {
-          "title": "Max Replica Lag (Messages)",
+          "title": "Max replica lag (messages)",
           "layout": {
             "column": 5,
             "row": 8,
@@ -146,7 +146,7 @@
           }
         },
         {
-          "title": "Request Latency (99th Percentile) (ms)",
+          "title": "Request latency (99th percentile) (ms)",
           "layout": {
             "column": 9,
             "row": 8,
@@ -160,13 +160,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT filter(average(`kafka.request.time.99p`), WHERE type='Produce') AS `Produce`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchConsumer') AS `Consumer Fetch`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchFollower') AS `Follower Fetch` FROM Metric WHERE metricName = 'kafka.request.time.99p' TIMESERIES AUTO"
+                "query": "SELECT filter(average(`kafka.request.time.99p`), WHERE type='Produce') AS `Produce`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchConsumer') AS `Consumer fetch`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchFollower') AS `Follower fetch` FROM Metric WHERE metricName = 'kafka.request.time.99p' TIMESERIES AUTO"
               }
             ]
           }
         },
         {
-          "title": "Requests In Per Second",
+          "title": "Requests per second",
           "layout": {
             "column": 1,
             "row": 11,
@@ -180,13 +180,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT rate(sum(kafka.request.count), 1 second) AS `Requests Per Second` FROM Metric WHERE metricName = 'kafka.request.count' AND type IN ('fetch', 'produce') TIMESERIES AUTO FACET type"
+                "query": "SELECT rate(sum(kafka.request.count), 1 second) AS `Requests per second` FROM Metric WHERE metricName = 'kafka.request.count' AND type IN ('fetch', 'produce') TIMESERIES AUTO FACET type"
               }
             ]
           }
         },
         {
-          "title": "Request Failures Per Second",
+          "title": "Request failures per second",
           "layout": {
             "column": 5,
             "row": 11,
@@ -200,13 +200,13 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT sum(kafka.request.failed) / sum((endTimestamp - timestamp) / 1000) AS `Failed Requests` FROM Metric WHERE metricName = 'kafka.request.failed' TIMESERIES AUTO FACET type"
+                "query": "SELECT sum(kafka.request.failed) / sum((endTimestamp - timestamp) / 1000) AS `Failed requests` FROM Metric WHERE metricName = 'kafka.request.failed' TIMESERIES AUTO FACET type"
               }
             ]
           }
         },
         {
-          "title": "Requests Waiting in Purgatory",
+          "title": "Requests waiting in purgatory",
           "layout": {
             "column": 9,
             "row": 11,
@@ -220,7 +220,7 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT average(kafka.purgatory.size) AS `Requests Waiting in Purgatory` FROM Metric WHERE metricName = 'kafka.purgatory.size' facet type TIMESERIES AUTO"
+                "query": "SELECT average(kafka.purgatory.size) AS `Requests waiting in purgatory` FROM Metric WHERE metricName = 'kafka.purgatory.size' facet type TIMESERIES AUTO"
               }
             ]
           }
@@ -240,7 +240,7 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT sum(`jvm.gc.collections.count`) AS 'Garbage Collections Count' FROM Metric WHERE metricName = 'jvm.gc.collections.count' TIMESERIES AUTO"
+                "query": "SELECT sum(`jvm.gc.collections.count`) AS 'Garbage collections count' FROM Metric WHERE metricName = 'jvm.gc.collections.count' TIMESERIES AUTO"
               }
             ]
           }
@@ -260,7 +260,7 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT average(`jvm.memory.heap.used`) AS `Heap Used in Bytes`, average(`jvm.memory.heap.max`) AS `Heap Max in Bytes` FROM Metric WHERE metricName IN ('jvm.memory.heap.used', 'jvm.memory.heap.max') TIMESERIES AUTO"
+                "query": "SELECT average(`jvm.memory.heap.used`) AS `Heap used in bytes`, average(`jvm.memory.heap.max`) AS `Heap max in bytes` FROM Metric WHERE metricName IN ('jvm.memory.heap.used', 'jvm.memory.heap.max') TIMESERIES AUTO"
               }
             ]
           }
@@ -280,7 +280,7 @@
             "nrqlQueries": [
               {
                 "accountId": 0,
-                "query": "SELECT average(`jvm.threads.count`) AS `Thread Count` FROM Metric WHERE metricName = 'jvm.threads.count' TIMESERIES AUTO"
+                "query": "SELECT average(`jvm.threads.count`) AS `Thread count` FROM Metric WHERE metricName = 'jvm.threads.count' TIMESERIES AUTO"
               }
             ]
           }
Lines changed: 8 additions & 8 deletions
@@ -1,12 +1,12 @@
 incomingMessagesPerSecond:
-  goldenMetric: incomingMessagesPerSecond
-  unit: OPERATIONS_PER_SECOND
-  title: Incoming messages per second
-bytesInPerSecond:
   goldenMetric: bytesInPerSecond
   unit: BYTES_PER_SECOND
   title: Bytes in per second
-produceRequestLatency99p:
-  goldenMetric: produceRequestLatency99p
-  unit: MS
-  title: Produce request latency (99th percentile)
+bytesOutSecond:
+  goldenMetric: bytesOutSecond
+  unit: BYTES_PER_SECOND
+  title: Bytes out per second
+underReplicatedPartitions:
+  goldenMetric: underReplicatedPartitions
+  unit: COUNT
+  title: Under replicated partitions
