|
| 1 | +/* |
| 2 | + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one |
| 3 | + * or more contributor license agreements. Licensed under the Elastic License |
| 4 | + * 2.0; you may not use this file except in compliance with the Elastic License |
| 5 | + * 2.0. |
| 6 | + */ |
| 7 | + |
| 8 | +import type { KibanaRequest, Logger } from '@kbn/core/server'; |
| 9 | +import type { InferenceClient } from '@kbn/inference-common'; |
| 10 | +import { MessageRole } from '@kbn/inference-common'; |
| 11 | +import dedent from 'dedent'; |
| 12 | +import type { ObservabilityAgentDataRegistry } from '../../data_registry/data_registry'; |
| 13 | + |
/**
 * The subset of an alert document that AI insights read.
 *
 * Every named field is optional, so any alert document can be passed in
 * regardless of which rule type produced it; other keys are accepted through
 * the index signature.
 */
export interface AlertDocForInsight {
  /** Alert start timestamp (`string | number`); used as the end of each context query window. */
  'kibana.alert.start'?: string | number;
  /** Service the alert is scoped to; without it no related signals are fetched. */
  'service.name'?: string;
  /** Environment of the service, forwarded to the data providers. */
  'service.environment'?: string;
  /** Transaction type, forwarded to the providers that accept it. */
  'transaction.type'?: string;
  /** Transaction name, forwarded to the providers that accept it. */
  'transaction.name'?: string;
  /** Any other alert field is allowed but not interpreted here. */
  [key: string]: unknown;
}
| 24 | + |
/** Everything `getAlertAiInsight` needs to build context and ask the model for a summary. */
interface GetAlertAiInsightParams {
  /** Alert to explain; may be undefined, in which case no signals are fetched. */
  alertDoc: AlertDocForInsight | undefined;
  /** Registry used to fetch the related APM signals. */
  dataRegistry: ObservabilityAgentDataRegistry;
  /** Incoming request, forwarded to every data-registry call. */
  request: KibanaRequest;
  /** Receives a debug entry whenever a data provider fails. */
  logger: Logger;
  /** Client used to run the chat completion. */
  inferenceClient: InferenceClient;
  /** Connector to run the completion against; an empty string is sent when undefined. */
  connectorId: string | undefined;
}
| 33 | + |
/** Outcome of an AI insight run. */
interface AlertAiInsightResult {
  /** Text returned by the model. */
  summary: string;
  /** The related-signals text that was given to the model as context. */
  context: string;
}
| 38 | + |
/**
 * Produces an AI-generated insight for an alert.
 *
 * First gathers the related-signals context for the alert, then asks the
 * model to summarize it.
 *
 * @param params - Alert document plus the services needed to fetch data and run inference.
 * @returns The model's summary together with the context it was generated from.
 */
export async function getAlertAiInsight(
  params: GetAlertAiInsightParams
): Promise<AlertAiInsightResult> {
  const { alertDoc, inferenceClient, connectorId, dataRegistry, request, logger } = params;

  // The summary depends on the context, so the two steps are strictly sequential.
  const context = await fetchAlertContext({ alertDoc, dataRegistry, request, logger });

  return {
    summary: await generateAlertSummary({ inferenceClient, connectorId, context }),
    context,
  };
}
| 52 | + |
/**
 * Resolves the alert start time to epoch milliseconds.
 *
 * Accepts an epoch number or a date string. A missing, falsy or unparseable
 * value falls back to the current time, so callers can always format the
 * result with `toISOString()` without risking a RangeError.
 */
function resolveAlertTimeMs(alertStartedAt: string | number | undefined): number {
  if (!alertStartedAt) {
    return Date.now();
  }
  // Pass the value through untouched: stringifying an epoch number first makes
  // `Date` parse it as a date string, which yields an invalid date.
  const parsed = new Date(alertStartedAt).getTime();
  return Number.isNaN(parsed) ? Date.now() : parsed;
}

/**
 * Collects the related APM signals for an alert and renders them as text.
 *
 * When the alert has a `service.name`, five data providers are queried, each
 * over its own look-back window ending at the alert start time. Providers are
 * best-effort: a failing provider is logged at debug level and skipped.
 *
 * @returns The non-empty sections separated by blank lines, in a fixed order,
 *          or `'No related signals available.'` when nothing was collected.
 */
async function fetchAlertContext({
  alertDoc,
  dataRegistry,
  request,
  logger,
}: Pick<
  GetAlertAiInsightParams,
  'alertDoc' | 'dataRegistry' | 'request' | 'logger'
>): Promise<string> {
  const noSignals = 'No related signals available.';

  const serviceName = alertDoc?.['service.name'] ?? '';
  if (!serviceName) {
    // Every provider is scoped to a service, so there is nothing to fetch.
    return noSignals;
  }

  const serviceEnvironment = alertDoc?.['service.environment'] ?? '';
  const transactionType = alertDoc?.['transaction.type'];
  const transactionName = alertDoc?.['transaction.name'];

  const alertTime = resolveAlertTimeMs(alertDoc?.['kibana.alert.start']);
  const alertEnd = new Date(alertTime).toISOString();
  const lookBack = (ms: number): string => new Date(alertTime - ms).toISOString();

  // Time ranges for different data providers
  const serviceSummaryStart = lookBack(5 * 60 * 1000); // 5 min before
  const downstreamStart = lookBack(24 * 60 * 60 * 1000); // 24 hours before
  const errorsStart = lookBack(15 * 60 * 1000); // 15 min before
  const changePointsStart = lookBack(6 * 60 * 60 * 1000); // 6 hours before

  // Runs one provider; a failure is logged and turned into "no section" so
  // that one broken provider never hides the others.
  const bestEffort = async (
    providerId: string,
    load: () => Promise<string | undefined>
  ): Promise<string | undefined> => {
    try {
      return await load();
    } catch (err) {
      logger.debug(`AI insight: ${providerId} failed: ${err}`);
      return undefined;
    }
  };

  // The providers are independent, so they are queried concurrently.
  // Promise.all keeps the input order, so section order stays fixed.
  const sections = await Promise.all([
    // APM Service Summary
    bestEffort('apmServiceSummary', async () => {
      const summary = await dataRegistry.getData('apmServiceSummary', {
        request,
        serviceName,
        serviceEnvironment,
        start: serviceSummaryStart,
        end: alertEnd,
        transactionType,
      });
      return summary ? `Service Summary:\n${JSON.stringify(summary, null, 2)}` : undefined;
    }),

    // APM Downstream Dependencies
    bestEffort('apmDownstreamDependencies', async () => {
      const downstream = await dataRegistry.getData('apmDownstreamDependencies', {
        request,
        serviceName,
        serviceEnvironment,
        start: downstreamStart,
        end: alertEnd,
      });
      return downstream && downstream.length > 0
        ? `Downstream Dependencies:\n${JSON.stringify(downstream, null, 2)}`
        : undefined;
    }),

    // APM Errors
    bestEffort('apmErrors', async () => {
      const errors = await dataRegistry.getData('apmErrors', {
        request,
        serviceName,
        serviceEnvironment,
        start: errorsStart,
        end: alertEnd,
      });
      return errors && errors.length > 0
        ? `APM Errors:\n${JSON.stringify(errors, null, 2)}`
        : undefined;
    }),

    // APM Service Change Points
    bestEffort('apmServiceChangePoints', async () => {
      const serviceChangePoints = await dataRegistry.getData('apmServiceChangePoints', {
        request,
        serviceName,
        serviceEnvironment,
        transactionType,
        transactionName,
        start: changePointsStart,
        end: alertEnd,
      });
      return serviceChangePoints && serviceChangePoints.length > 0
        ? `Service Change Points:\n${JSON.stringify(serviceChangePoints, null, 2)}`
        : undefined;
    }),

    // APM Exit Span Change Points
    bestEffort('apmExitSpanChangePoints', async () => {
      const exitSpanChangePoints = await dataRegistry.getData('apmExitSpanChangePoints', {
        request,
        serviceName,
        serviceEnvironment,
        start: changePointsStart,
        end: alertEnd,
      });
      return exitSpanChangePoints && exitSpanChangePoints.length > 0
        ? `Exit Span Change Points:\n${JSON.stringify(exitSpanChangePoints, null, 2)}`
        : undefined;
    }),
  ]);

  const contextParts = sections.filter((part): part is string => part !== undefined);

  return contextParts.length > 0 ? contextParts.join('\n\n') : noSignals;
}
| 170 | + |
/**
 * Asks the model for a short, SRE-oriented summary of an alert.
 *
 * @param inferenceClient - Client whose `chatComplete` runs the completion.
 * @param connectorId - Connector to use; an empty string is sent when undefined.
 * @param context - Related-signals text, placed verbatim in the user message.
 * @returns The completion's `content`, or an empty string when it has none.
 */
async function generateAlertSummary({
  inferenceClient,
  connectorId,
  context,
}: {
  // NOTE(review): the file already imports the `InferenceClient` type; consider
  // using it here instead of `any` so the `chatComplete` call is type-checked.
  inferenceClient: any;
  connectorId: string | undefined;
  context: string;
}): Promise<string> {
  const system = dedent(`
    You are an SRE assistant. Help an SRE quickly understand likely cause, impact, and next actions for this alert using the provided context.

    Output shape (plain text):
    - Summary (1–2 sentences): What is likely happening and why it matters. If recovered, acknowledge and reduce urgency. If no strong signals, say "Inconclusive" and briefly note why.
    - Assessment: Most plausible explanation or "Inconclusive" if signals do not support a clear assessment.
    - Related signals (top 3–5, each with provenance and relevance): For each item, include source (change points | errors | log rate | log categories | anomalies | service summary), timeframe near alert start, and relevance to alert scope as Direct | Indirect | Unrelated.
    - Immediate actions (2–3): Concrete next checks or fixes an SRE can take now.

    Guardrails:
    - Do not repeat the alert reason string or rule name verbatim.
    - Only provide a non‑inconclusive Assessment when supported by on‑topic related signals; otherwise set Assessment to "Inconclusive" and do not speculate a cause.
    - Corroboration: prefer assessment supported by multiple independent signal types; if only one source supports it, state that support is limited.
    - If signals are weak or conflicting, state that clearly and recommend the safest next diagnostic step.
    - Do not list raw alert fields as bullet points. Bullets are allowed only for Related signals and Immediate actions.
    - Keep it concise (~150–200 words).

    Related signals hierarchy (use what exists, skip what doesn't):
    1) Change points (service and exit‑span): sudden shifts in throughput/latency/failure; name impacted downstream services verbatim when present and whether propagation is likely.
    2) Errors: signatures enriched with downstream resource/name; summarize patterns without long stacks; tie to alert scope.
    3) Logs: strongest log‑rate significant items and top categories; very short examples and implications; tie to alert scope.
    4) Anomalies: note ML anomalies around alert time; multiple affected services may imply systemic issues.
    5) Service summary: only details that materially change interpretation (avoid re‑listing fields).

    Recovery / false positives:
    - If recovered or normalizing, recommend light‑weight validation and watchful follow‑up.
    - If inconclusive or signals skew Indirect/Unrelated, state that the alert may be unrelated/noisy and suggest targeted traces/logging for the suspected path.
  `);

  const task = dedent(`
    Task:
    Summarize likely cause, impact, and immediate next checks for this alert using the format above. Tie related signals to the alert scope; ignore unrelated noise. If signals are weak or conflicting, mark Assessment "Inconclusive" and propose the safest next diagnostic step.
  `);

  // Only the static template text goes through dedent. The context is data
  // (pretty-printed JSON) and is spliced in afterwards, so its indentation and
  // escape sequences reach the model exactly as produced, and its shallower
  // indentation cannot stop the template itself from being dedented.
  const prompt = `Context:\n${context}\n\n${task}`;

  const completion = await inferenceClient.chatComplete({
    connectorId: connectorId ?? '',
    system,
    messages: [{ role: MessageRole.User, content: prompt }],
  });

  return completion.content ?? '';
}
0 commit comments