Skip to content

Commit 48b683b

Browse files
shashank-reddy-nr, davidgonzalez-nr, vetusbs
authored
fix: Fixed OTEL Kafka cluster definition (#2440)
* fix: Fix OTEL Kafka Dashboards * fix: Corrected dashboard jsons * fix: validation fix * fix: Fixed summary metrics * fix: Fix cluster def * fix: Fix cluster def * fix: Fixed partition count summary metrics * fix: Fixed underreplicated parts golden metrics * fix: Fixed underreplicated parts golden metrics * fix: Fixed underreplicated parts golden metrics * fix: Fixed underreplicated parts golden metrics * fix: Fixed underreplicated parts golden metrics * fix: Golden metrics exp * fix: Golden metrics exp * fix: Golden metrics exp * fix: Golden metrics exp * fix: Golden metrics exp * fix: Golden metrics exp * fix: Updated otel kafka dashboard --------- Co-authored-by: David Gonzalez <[email protected]> Co-authored-by: vetusbs <[email protected]>
1 parent ffde25a commit 48b683b

File tree

6 files changed

+82
-43
lines changed

6 files changed

+82
-43
lines changed

entity-types/infra-kafkabroker/golden_metrics.stg.yml

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ incomingMessagesPerSecond:
1212
from: Metric
1313
where: metricName = 'kafka.message.count'
1414
eventId: entity.guid
15-
eventName: kafka.broker.name
1615

1716
bytesInPerSecond:
1817
title: Bytes in per second
@@ -28,7 +27,6 @@ bytesInPerSecond:
2827
from: Metric
2928
where: metricName = 'kafka.network.io' AND (state = 'in' OR direction = 'in')
3029
eventId: entity.guid
31-
eventName: kafka.broker.name
3230

3331
underReplicatedPartitions:
3432
title: Under replicated partitions
@@ -44,7 +42,6 @@ underReplicatedPartitions:
4442
from: Metric
4543
where: metricName = 'kafka.partition.under_replicated'
4644
eventId: entity.guid
47-
eventName: kafka.broker.name
4845

4946
produceRequestLatency99p:
5047
title: Produce request latency (99th percentile)
@@ -60,7 +57,6 @@ produceRequestLatency99p:
6057
from: Metric
6158
where: metricName = 'kafka.request.time.99p' AND type = 'Produce'
6259
eventId: entity.guid
63-
eventName: kafka.broker.name
6460

6561
failedRequestsPerSecond:
6662
title: Failed requests per second
@@ -75,5 +71,4 @@ failedRequestsPerSecond:
7571
select: sum(kafka.request.failed) / sum((endTimestamp - timestamp) / 1000)
7672
from: Metric
7773
where: metricName = 'kafka.request.failed'
78-
eventId: entity.guid
79-
eventName: kafka.broker.name
74+
eventId: entity.guid

entity-types/infra-kafkabroker/opentelemetry_dashboard.stg.json

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,13 @@
8080
"nrqlQueries": [
8181
{
8282
"accountId": 0,
83-
"query": "SELECT latest(kafka.partition.under_replicated) AS 'Under Replicated Partitions' FROM Metric WHERE metricName='kafka.partition.under_replicated' TIMESERIES AUTO"
83+
"query": "SELECT max(kafka.partition.under_replicated) AS 'Under Replicated Partitions' FROM Metric WHERE metricName='kafka.partition.under_replicated' TIMESERIES AUTO"
8484
}
8585
]
8686
}
8787
},
8888
{
89-
"title": "In-Sync Replica Operations (per min)",
89+
"title": "Under Min ISR Partitions",
9090
"layout": {
9191
"column": 7,
9292
"row": 5,
@@ -100,13 +100,13 @@
100100
"nrqlQueries": [
101101
{
102102
"accountId": 0,
103-
"query": "SELECT rate(sum(kafka.isr.operation.count), 1 minute) AS 'ISR Operations' FROM Metric WHERE metricName = 'kafka.isr.operation.count' TIMESERIES AUTO FACET operation"
103+
"query": "SELECT max(kafka.partition.under_min_isr) AS 'Under Min ISR Partitions' FROM Metric WHERE metricName = 'kafka.partition.under_min_isr' TIMESERIES AUTO"
104104
}
105105
]
106106
}
107107
},
108108
{
109-
"title": "Max Replica Lag (Messages)",
109+
"title": "In-Sync Replica Operations (per min)",
110110
"layout": {
111111
"column": 1,
112112
"row": 8,
@@ -120,13 +120,13 @@
120120
"nrqlQueries": [
121121
{
122122
"accountId": 0,
123-
"query": "SELECT max(kafka.max.lag) as 'Messages' FROM Metric WHERE metricName = 'kafka.max.lag' TIMESERIES AUTO"
123+
"query": "SELECT rate(sum(kafka.isr.operation.count), 1 minute) AS 'ISR Operations' FROM Metric WHERE metricName = 'kafka.isr.operation.count' TIMESERIES AUTO FACET operation"
124124
}
125125
]
126126
}
127127
},
128128
{
129-
"title": "Requests In Per Second",
129+
"title": "Max Replica Lag (Messages)",
130130
"layout": {
131131
"column": 5,
132132
"row": 8,
@@ -140,13 +140,13 @@
140140
"nrqlQueries": [
141141
{
142142
"accountId": 0,
143-
"query": "SELECT rate(sum(kafka.request.count), 1 second) AS `Requests Per Second` FROM Metric WHERE metricName = 'kafka.request.count' AND type IN ('fetch', 'produce') TIMESERIES AUTO FACET type"
143+
"query": "SELECT max(kafka.max.lag) as 'Messages' FROM Metric WHERE metricName = 'kafka.max.lag' TIMESERIES AUTO"
144144
}
145145
]
146146
}
147147
},
148148
{
149-
"title": "Request Failures Per Second",
149+
"title": "Request Latency (99th Percentile) (ms)",
150150
"layout": {
151151
"column": 9,
152152
"row": 8,
@@ -160,13 +160,13 @@
160160
"nrqlQueries": [
161161
{
162162
"accountId": 0,
163-
"query": "SELECT sum(kafka.request.failed) / sum((endTimestamp - timestamp) / 1000) AS `Failed Requests` FROM Metric WHERE metricName = 'kafka.request.failed' TIMESERIES AUTO FACET type"
163+
"query": "SELECT filter(average(`kafka.request.time.99p`), WHERE type='Produce') AS `Produce`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchConsumer') AS `Consumer Fetch`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchFollower') AS `Follower Fetch` FROM Metric WHERE metricName = 'kafka.request.time.99p' TIMESERIES AUTO"
164164
}
165165
]
166166
}
167167
},
168168
{
169-
"title": "Request Latency (99th Percentile) (ms)",
169+
"title": "Requests In Per Second",
170170
"layout": {
171171
"column": 1,
172172
"row": 11,
@@ -180,13 +180,13 @@
180180
"nrqlQueries": [
181181
{
182182
"accountId": 0,
183-
"query": "SELECT filter(average(`kafka.request.time.99p`), WHERE type='Produce') AS `Produce`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchConsumer') AS `Consumer Fetch`, filter(average(`kafka.request.time.99p`), WHERE type = 'FetchFollower') AS `Follower Fetch` FROM Metric WHERE metricName = 'kafka.request.time.99p' TIMESERIES AUTO"
183+
"query": "SELECT rate(sum(kafka.request.count), 1 second) AS `Requests Per Second` FROM Metric WHERE metricName = 'kafka.request.count' AND type IN ('fetch', 'produce') TIMESERIES AUTO FACET type"
184184
}
185185
]
186186
}
187187
},
188188
{
189-
"title": "Request Queue Size",
189+
"title": "Request Failures Per Second",
190190
"layout": {
191191
"column": 5,
192192
"row": 11,
@@ -200,7 +200,7 @@
200200
"nrqlQueries": [
201201
{
202202
"accountId": 0,
203-
"query": "SELECT average(kafka.request.queue) as `Request Queue Size` FROM Metric WHERE metricName = 'kafka.request.queue' TIMESERIES AUTO"
203+
"query": "SELECT sum(kafka.request.failed) / sum((endTimestamp - timestamp) / 1000) AS `Failed Requests` FROM Metric WHERE metricName = 'kafka.request.failed' TIMESERIES AUTO FACET type"
204204
}
205205
]
206206
}

entity-types/infra-kafkacluster/definition.stg.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ synthesis:
1919
present: false
2020
- attribute: topic
2121
present: false
22+
- attribute: client-id
23+
present: false
24+
- attribute: service.name
25+
present: false
2226
- attribute: instrumentation.provider
2327
value: opentelemetry
2428
tags:

entity-types/infra-kafkacluster/golden_metrics.stg.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ partitionCount:
4343
opentelemetry:
4444
select: latest(kafka.partition.count)
4545
from: Metric
46-
where: metricName='kafka.partition.count'
46+
where: metricName='kafka.cluster.partition.count'
4747
eventId: entity.guid
4848

4949
offlinePartitions:
Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,60 @@
1-
partitionsWithNonPreferredLeader:
2-
title: Partitions with non-preferred leader
1+
topicPartitionsCount:
2+
queries:
3+
opentelemetry:
4+
select: latest(kafka.topic.partitions)
5+
from: Metric
6+
where: metricName = 'kafka.topic.partitions'
7+
eventId: entity.guid
38
unit: COUNT
9+
title: Topic Partitions
10+
11+
bytesInPerSecond:
12+
title: Bytes in per second
13+
unit: BYTES_PER_SECOND
14+
queries:
15+
opentelemetry:
16+
select: rate(sum(kafka.topic.io), 1 second)
17+
from: Metric
18+
where: metricName = 'kafka.topic.io' AND (state = 'in' OR direction = 'in')
19+
eventId: entity.guid
20+
21+
bytesOutPerSecond:
22+
title: Bytes out per second
23+
unit: BYTES_PER_SECOND
24+
queries:
25+
opentelemetry:
26+
select: rate(sum(kafka.topic.io), 1 second)
27+
from: Metric
28+
where: metricName = 'kafka.topic.io' AND (state = 'out' OR direction = 'out')
29+
eventId: entity.guid
30+
31+
messagesInPerSecond:
32+
title: Messages in per second
33+
unit: MESSAGES_PER_SECOND
34+
queries:
35+
opentelemetry:
36+
select: rate(sum(kafka.prod.msg.count), 1 second)
37+
from: Metric
38+
where: metricName = 'kafka.prod.msg.count'
39+
eventId: entity.guid
40+
41+
# TODO: Find a way to generate the metrics below in OTEL
42+
underReplicatedPartitions:
43+
unit: COUNT
44+
title: Under replicated partitions
445
queries:
546
newRelic:
6-
select: average(topic.partitionsWithNonPreferredLeader)
47+
select: latest(topic.underReplicatedPartitions)
748
from: KafkaTopicSample
849
eventId: entityGuid
950
eventName: entityName
10-
underReplicatedPartitions:
51+
52+
partitionsWithNonPreferredLeader:
53+
title: Partitions with non-preferred leader
54+
unit: COUNT
1155
queries:
1256
newRelic:
13-
select: average(topic.underReplicatedPartitions)
57+
select: average(topic.partitionsWithNonPreferredLeader)
1458
from: KafkaTopicSample
1559
eventId: entityGuid
16-
eventName: entityName
17-
opentelemetry:
18-
select: ((sum(kafka.partition.replicas) - sum(kafka.partition.replicas_in_sync))/sum(1), uniques(partition))
19-
from: Metric
20-
where: metricName IN ('kafka.partition.replicas', 'kafka.partition.replicas_in_sync')
21-
eventId: entity.guid
22-
eventName: entity.name
23-
unit: COUNT
24-
title: Under replicated partitions
60+
eventName: entityName
Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1-
hostName:
2-
title: Host Name
3-
unit: STRING
4-
tag:
5-
key: hostName
6-
7-
underReplicatedPartitions:
8-
goldenMetric: underReplicatedPartitions
1+
topicPartitionsCount:
2+
goldenMetric: topicPartitionsCount
93
unit: COUNT
10-
title: Under replicated partitions
4+
title: Topic Partitions
5+
bytesInPerSecond:
6+
goldenMetric: bytesInPerSecond
7+
unit: BYTES_PER_SECOND
8+
title: Bytes in per second
9+
bytesOutPerSecond:
10+
goldenMetric: bytesOutPerSecond
11+
unit: BYTES_PER_SECOND
12+
title: Bytes out per second
13+
14+

0 commit comments

Comments
 (0)