Skip to content

Commit 4947a18

Browse files
committed
clean up route
1 parent 8ca3c1b commit 4947a18

File tree

4 files changed

+313
-253
lines changed

4 files changed

+313
-253
lines changed

x-pack/solutions/observability/plugins/observability_agent/server/plugin.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import { registerTools } from './tools/register_tools';
1717
import { registerAttachments } from './attachments/register_attachments';
1818
import { getIsObservabilityAgentEnabled } from './utils/get_is_obs_agent_enabled';
1919
import { OBSERVABILITY_AGENT_FEATURE_FLAG } from '../common/constants';
20-
import { registerAiInsightRoutes } from './routes/ai_insights/register_routes';
20+
import { registerAiInsightRoutes } from './routes/register_routes';
2121
import type {
2222
ObservabilityAgentPluginSetup,
2323
ObservabilityAgentPluginSetupDependencies,
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
import type { KibanaRequest, Logger } from '@kbn/core/server';
9+
import type { InferenceClient } from '@kbn/inference-common';
10+
import { MessageRole } from '@kbn/inference-common';
11+
import dedent from 'dedent';
12+
import type { ObservabilityAgentDataRegistry } from '../../data_registry/data_registry';
13+
14+
/**
 * Minimal shape of the alert document consumed when building AI insights.
 *
 * Every named field is optional, so this works for any alert that happens to
 * have these fields populated.
 */
export interface AlertDocForInsight {
  /** Value of the alert's `service.name` field, when present. */
  'service.name'?: string;
  /** Value of the alert's `service.environment` field, when present. */
  'service.environment'?: string;
  /** Value of the alert's `transaction.type` field, when present. */
  'transaction.type'?: string;
  /** Value of the alert's `transaction.name` field, when present. */
  'transaction.name'?: string;
  /** Alert start time; may arrive as either a string or a number. */
  'kibana.alert.start'?: string | number;
  /** Any additional alert field is accepted but left untyped. */
  [key: string]: unknown;
}
24+
25+
/** Everything `getAlertAiInsight` needs to produce an insight for one alert. */
interface GetAlertAiInsightParams {
  /** The alert to explain; may be undefined. */
  alertDoc: AlertDocForInsight | undefined;
  /** Client used to request the chat completion that becomes the summary. */
  inferenceClient: InferenceClient;
  /** Connector to run the completion against; may be undefined. */
  connectorId: string | undefined;
  /** Registry queried for the related signals that form the context. */
  dataRegistry: ObservabilityAgentDataRegistry;
  /** The incoming request, forwarded to every data registry lookup. */
  request: KibanaRequest;
  /** Receives debug messages when a data registry lookup fails. */
  logger: Logger;
}
33+
34+
/** The insight returned for an alert. */
interface AlertAiInsightResult {
  /** Text summary produced by the model. */
  summary: string;
  /** The related-signal context that the summary was generated from. */
  context: string;
}
38+
39+
/**
 * Produces an AI insight for an alert in two steps: gather the related-signal
 * context for the alert, then ask the model to summarize that context.
 *
 * @param params - The alert document plus the clients needed to fetch context
 *   and run the completion.
 * @returns The generated summary together with the context it was based on.
 */
export async function getAlertAiInsight(
  params: GetAlertAiInsightParams
): Promise<AlertAiInsightResult> {
  const { alertDoc, inferenceClient, connectorId, dataRegistry, request, logger } = params;

  // The summary depends on the context, so the two steps run strictly in order.
  const relatedContext = await fetchAlertContext({ alertDoc, dataRegistry, request, logger });
  const generatedSummary = await generateAlertSummary({
    inferenceClient,
    connectorId,
    context: relatedContext,
  });

  return { summary: generatedSummary, context: relatedContext };
}
52+
53+
// Lookback windows (milliseconds before the alert start) used per data provider.
const SERVICE_SUMMARY_LOOKBACK_MS = 5 * 60 * 1000; // 5 minutes
const ERRORS_LOOKBACK_MS = 15 * 60 * 1000; // 15 minutes
const CHANGE_POINTS_LOOKBACK_MS = 6 * 60 * 60 * 1000; // 6 hours
const DOWNSTREAM_LOOKBACK_MS = 24 * 60 * 60 * 1000; // 24 hours

// Returned when no provider contributed any data.
const NO_RELATED_SIGNALS = 'No related signals available.';

/**
 * Resolves the alert start to epoch milliseconds.
 *
 * Accepts a date string, an epoch-millisecond number, or a purely numeric
 * string. Falls back to the current time when the value is missing or cannot
 * be turned into a date whose lookback windows are representable, so that the
 * later `toISOString()` calls can never throw a `RangeError`.
 */
function resolveAlertTimeMs(alertStartedAt: string | number | undefined): number {
  if (!alertStartedAt) {
    return Date.now();
  }

  let timeMs: number;
  if (typeof alertStartedAt === 'number') {
    // Numbers are already epoch milliseconds; stringifying them first would
    // produce an unparseable date string.
    timeMs = alertStartedAt;
  } else {
    timeMs = Date.parse(alertStartedAt);
    if (Number.isNaN(timeMs) && /^\d+$/.test(alertStartedAt.trim())) {
      // Epoch milliseconds that arrived serialized as a string.
      timeMs = Number(alertStartedAt.trim());
    }
  }

  const isRepresentable = (ms: number): boolean => !Number.isNaN(new Date(ms).getTime());
  return isRepresentable(timeMs) && isRepresentable(timeMs - DOWNSTREAM_LOOKBACK_MS)
    ? timeMs
    : Date.now();
}

/**
 * Runs one provider lookup and renders its result as a titled context section.
 *
 * Best-effort by design: a failing lookup is logged at debug level and yields
 * no section, so one broken provider never prevents the others from
 * contributing.
 *
 * @returns The rendered section, or `undefined` when the lookup failed or
 *   produced nothing.
 */
async function collectContextSection(
  title: string,
  providerId: string,
  logger: Logger,
  load: () => Promise<unknown>
): Promise<string | undefined> {
  try {
    const data = await load();
    return data ? `${title}:\n${JSON.stringify(data, null, 2)}` : undefined;
  } catch (err) {
    logger.debug(`AI insight: ${providerId} failed: ${err}`);
    return undefined;
  }
}

/**
 * Gathers the related-signal context for an alert from the data registry.
 *
 * Nothing is queried unless the alert carries a `service.name`. When it does,
 * the service summary, downstream dependencies, errors, service change points
 * and exit span change points are fetched concurrently, each over its own
 * lookback window ending at the alert start. The sections are joined in that
 * fixed order regardless of which lookup finishes first.
 *
 * @returns The joined sections, or `'No related signals available.'` when no
 *   provider contributed data.
 */
async function fetchAlertContext({
  alertDoc,
  dataRegistry,
  request,
  logger,
}: Pick<
  GetAlertAiInsightParams,
  'alertDoc' | 'dataRegistry' | 'request' | 'logger'
>): Promise<string> {
  const serviceName = alertDoc?.['service.name'] ?? '';
  if (!serviceName) {
    return NO_RELATED_SIGNALS;
  }

  const serviceEnvironment = alertDoc?.['service.environment'] ?? '';
  const transactionType = alertDoc?.['transaction.type'];
  const transactionName = alertDoc?.['transaction.name'];

  const alertTimeMs = resolveAlertTimeMs(alertDoc?.['kibana.alert.start']);
  const end = new Date(alertTimeMs).toISOString();
  const startBefore = (lookbackMs: number): string =>
    new Date(alertTimeMs - lookbackMs).toISOString();
  const changePointsStart = startBefore(CHANGE_POINTS_LOOKBACK_MS);

  // The lookups are independent of each other, so they run concurrently.
  // Promise.all keeps the results in array order, which fixes the section order.
  const sections = await Promise.all([
    collectContextSection('Service Summary', 'apmServiceSummary', logger, async () => {
      const summary = await dataRegistry.getData('apmServiceSummary', {
        request,
        serviceName,
        serviceEnvironment,
        start: startBefore(SERVICE_SUMMARY_LOOKBACK_MS),
        end,
        transactionType,
      });
      return summary ? summary : undefined;
    }),
    collectContextSection(
      'Downstream Dependencies',
      'apmDownstreamDependencies',
      logger,
      async () => {
        const downstream = await dataRegistry.getData('apmDownstreamDependencies', {
          request,
          serviceName,
          serviceEnvironment,
          start: startBefore(DOWNSTREAM_LOOKBACK_MS),
          end,
        });
        return downstream && downstream.length > 0 ? downstream : undefined;
      }
    ),
    collectContextSection('APM Errors', 'apmErrors', logger, async () => {
      const errors = await dataRegistry.getData('apmErrors', {
        request,
        serviceName,
        serviceEnvironment,
        start: startBefore(ERRORS_LOOKBACK_MS),
        end,
      });
      return errors && errors.length > 0 ? errors : undefined;
    }),
    collectContextSection('Service Change Points', 'apmServiceChangePoints', logger, async () => {
      const serviceChangePoints = await dataRegistry.getData('apmServiceChangePoints', {
        request,
        serviceName,
        serviceEnvironment,
        transactionType,
        transactionName,
        start: changePointsStart,
        end,
      });
      return serviceChangePoints && serviceChangePoints.length > 0
        ? serviceChangePoints
        : undefined;
    }),
    collectContextSection(
      'Exit Span Change Points',
      'apmExitSpanChangePoints',
      logger,
      async () => {
        const exitSpanChangePoints = await dataRegistry.getData('apmExitSpanChangePoints', {
          request,
          serviceName,
          serviceEnvironment,
          start: changePointsStart,
          end,
        });
        return exitSpanChangePoints && exitSpanChangePoints.length > 0
          ? exitSpanChangePoints
          : undefined;
      }
    ),
  ]);

  const contextParts = sections.filter((section): section is string => section !== undefined);
  return contextParts.length > 0 ? contextParts.join('\n\n') : NO_RELATED_SIGNALS;
}
170+
171+
/**
 * Asks the model for an SRE-oriented summary of an alert based on the
 * already-gathered context.
 *
 * @param inferenceClient - Client used to run the chat completion.
 * @param connectorId - Connector to run against; an empty string is sent when
 *   it is undefined.
 * @param context - Related-signal text, embedded verbatim in the user prompt.
 * @returns The completion's content, or an empty string when it has none.
 */
async function generateAlertSummary({
  inferenceClient,
  connectorId,
  context,
}: {
  inferenceClient: InferenceClient;
  connectorId: string | undefined;
  context: string;
}): Promise<string> {
  const system = dedent(`
    You are an SRE assistant. Help an SRE quickly understand likely cause, impact, and next actions for this alert using the provided context.

    Output shape (plain text):
    - Summary (1–2 sentences): What is likely happening and why it matters. If recovered, acknowledge and reduce urgency. If no strong signals, say "Inconclusive" and briefly note why.
    - Assessment: Most plausible explanation or "Inconclusive" if signals do not support a clear assessment.
    - Related signals (top 3–5, each with provenance and relevance): For each item, include source (change points | errors | log rate | log categories | anomalies | service summary), timeframe near alert start, and relevance to alert scope as Direct | Indirect | Unrelated.
    - Immediate actions (2–3): Concrete next checks or fixes an SRE can take now.

    Guardrails:
    - Do not repeat the alert reason string or rule name verbatim.
    - Only provide a non‑inconclusive Assessment when supported by on‑topic related signals; otherwise set Assessment to "Inconclusive" and do not speculate a cause.
    - Corroboration: prefer assessment supported by multiple independent signal types; if only one source supports it, state that support is limited.
    - If signals are weak or conflicting, state that clearly and recommend the safest next diagnostic step.
    - Do not list raw alert fields as bullet points. Bullets are allowed only for Related signals and Immediate actions.
    - Keep it concise (~150–200 words).

    Related signals hierarchy (use what exists, skip what doesn't):
    1) Change points (service and exit‑span): sudden shifts in throughput/latency/failure; name impacted downstream services verbatim when present and whether propagation is likely.
    2) Errors: signatures enriched with downstream resource/name; summarize patterns without long stacks; tie to alert scope.
    3) Logs: strongest log‑rate significant items and top categories; very short examples and implications; tie to alert scope.
    4) Anomalies: note ML anomalies around alert time; multiple affected services may imply systemic issues.
    5) Service summary: only details that materially change interpretation (avoid re‑listing fields).

    Recovery / false positives:
    - If recovered or normalizing, recommend light‑weight validation and watchful follow‑up.
    - If inconclusive or signals skew Indirect/Unrelated, state that the alert may be unrelated/noisy and suggest targeted traces/logging for the suspected path.
  `);

  // The user prompt is assembled by hand rather than through dedent: the
  // context is pretty-printed JSON, and dedenting it would strip the JSON's own
  // indentation and turn escaped "\n" sequences inside JSON strings into real
  // line breaks.
  const prompt = [
    'Context:',
    context,
    '',
    'Task:',
    'Summarize likely cause, impact, and immediate next checks for this alert using the format above. Tie related signals to the alert scope; ignore unrelated noise. If signals are weak or conflicting, mark Assessment "Inconclusive" and propose the safest next diagnostic step.',
  ].join('\n');

  const completion = await inferenceClient.chatComplete({
    // NOTE(review): an undefined connector is sent as '' — presumably the
    // inference client rejects it; confirm whether an explicit error is preferable.
    connectorId: connectorId ?? '',
    system,
    messages: [{ role: MessageRole.User, content: prompt }],
  });

  return completion.content ?? '';
}

0 commit comments

Comments
 (0)