[Obs AI Assistant] migrate connector evaluation to new framework (#234219)

neptunian · kibanamachine · web-flow · commit 07e9b9f6df01 · 2025-09-15T09:46:05.000-04:00
Migrate connector evaluation to new framework. <details> <summary>results from single run</summary> <img width="683" height="597" alt="Screenshot 2025-09-08 at 16 58 59" src="https://github.com/user-attachments/assets/eac3171a-81f6-4965-8ef2-f53fcc8625db" /> <img width="682" height="606" alt="Screenshot 2025-09-08 at 16 58 03" src="https://github.com/user-attachments/assets/0a51bd44-6294-419d-90e1-7fea3ebfab0b" /> <img width="672" height="600" alt="Screenshot 2025-09-08 at 16 56 21" src="https://github.com/user-attachments/assets/11222b2f-35c3-4fd2-b8d5-b039973a08bc" /> <img width="699" height="590" alt="Screenshot 2025-09-08 at 16 56 12" src="https://github.com/user-attachments/assets/045aa493-3673-4b35-9f19-260a1d3742a7" /> <img width="687" height="610" alt="Screenshot 2025-09-08 at 16 55 07" src="https://github.com/user-attachments/assets/2600898d-debe-49d1-984e-56134b7eab4a" /> <img width="689" height="594" alt="Screenshot 2025-09-08 at 16 54 03" src="https://github.com/user-attachments/assets/a73e5fa3-73c1-45df-a873-39ccad36e9a7" /> <img width="674" height="598" alt="Screenshot 2025-09-08 at 16 53 58" src="https://github.com/user-attachments/assets/56b57da8-8fb4-48e9-b942-eb3f5679e284" /> </details> ## Run tests Start the server: `node scripts/scout.js start-server --stateful` Run the tests: `EVALUATION_CONNECTOR_ID=gemini-2_5-pro node scripts/playwright test --config x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/playwright.config.ts --grep "execute_connector function"` --------- Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/evals/connector/connector.spec.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/evals/connector/connector.spec.ts
@@ -0,0 +1,183 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { EXECUTE_CONNECTOR_FUNCTION_NAME } from '@kbn/observability-ai-assistant-plugin/common';
+import type { ActionConnector } from '@kbn/alerts-ui-shared/src/common/types';
+import { evaluate as base } from '../../src/evaluate';
+import type { EvaluateConnectorDataset } from './evaluate_connector_dataset';
+import { createEvaluateConnectorDataset } from './evaluate_connector_dataset';
+
+/**
+ * NOTE: This scenario has been migrated from the legacy evaluation framework.
+ * - x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/connector/index.spec.ts
+ * Any changes should be made in both places until the legacy evaluation framework is removed.
+ */
+
+// User prompt shared by every dataset example in this spec.
+const EMAIL_PROMPT =
+  'Send an email to user@test.com with the subject "Test Email" and body "This is a test email."';
+
+// Criteria the assistant's response is evaluated against in the
+// "with email connector" tests (both the basic and the user-instructions variant).
+const EMAIL_EVAL_CRITERIA = [
+  `Uses the ${EXECUTE_CONNECTOR_FUNCTION_NAME} function to send the email before providing a final answer to the user.`,
+  'Clearly explains to the user that an email will be sent and summarizes the provided details (recipient, subject, body).',
+  'Confirms successful email delivery and includes the recipient, subject, and message in the summary.',
+  'Does not include irrelevant or unrelated information in the response.',
+];
+
+/**
+ * Extends the base `evaluate` test object with a test-scoped
+ * `evaluateConnectorDataset` fixture, built from the `chatClient`,
+ * `evaluators` and `phoenixClient` fixtures.
+ */
+const evaluate = base.extend<{
+  evaluateConnectorDataset: EvaluateConnectorDataset;
+}>({
+  evaluateConnectorDataset: [
+    async ({ chatClient, evaluators, phoenixClient }, use) => {
+      // Await `use` so the fixture function only returns once the test is done
+      // and a rejection surfaces as a fixture failure instead of a dropped promise.
+      await use(
+        createEvaluateConnectorDataset({
+          chatClient,
+          evaluators,
+          phoenixClient,
+        })
+      );
+    },
+    { scope: 'test' },
+  ],
+});
+
+evaluate.describe('execute_connector function', { tag: '@svlOblt' }, () => {
+  // Scenario without connector setup: no hook in this describe creates an email connector.
+  // NOTE(review): presumably relies on the target environment having no pre-existing
+  // email connector — confirm.
+  evaluate.describe('no email connector available', () => {
+    evaluate(
+      'does not send an email and fails gracefully',
+      async ({ evaluateConnectorDataset }) => {
+        await evaluateConnectorDataset({
+          dataset: {
+            name: 'connector: no email connector',
+            description: 'Validate behavior when no Actions email connector exists.',
+            examples: [
+              {
+                input: { prompt: EMAIL_PROMPT },
+                output: {
+                  // Scenario-specific criteria: the assistant is expected NOT to call the function.
+                  criteria: [
+                    `Does not use ${EXECUTE_CONNECTOR_FUNCTION_NAME} function.`,
+                    'Explains that no connectors are available to send the email.',
+                    'Does not attempt to send an email.',
+                    'Mentions that sending the email was unsuccessful.',
+                  ],
+                },
+                metadata: {},
+              },
+            ],
+          },
+        });
+      }
+    );
+  });
+
+  evaluate.describe('with email connector', () => {
+    let emailConnectorId: string;
+
+    // Creates the email connector that the tests in this describe rely on.
+    evaluate.beforeAll(async ({ kbnClient, log }) => {
+      // Request body describing the test email connector.
+      const connectorDefinition = {
+        name: 'email-connector-test',
+        config: {
+          from: 'test@example.com',
+          service: '__json',
+        },
+        secrets: {
+          user: 'test',
+          password: '123456',
+        },
+        connector_type_id: '.email',
+      };
+
+      const response = await kbnClient.request<ActionConnector>({
+        method: 'POST',
+        path: '/api/actions/connector',
+        body: connectorDefinition,
+      });
+      log.success('Email connector created successfully');
+
+      // Keep the ID so the user-instructions test and the cleanup hook can reference it.
+      emailConnectorId = response.data.id;
+    });
+
+    // Runs the shared prompt with no extra guidance; the prompt itself does not
+    // mention the ID of the connector created in `beforeAll`.
+    evaluate('sends an email (basic)', async ({ evaluateConnectorDataset }) => {
+      await evaluateConnectorDataset({
+        dataset: {
+          name: 'connector: with email connector (basic)',
+          description:
+            'Validates that the assistant uses execute_connector and summarizes correctly.',
+          examples: [
+            {
+              input: { prompt: EMAIL_PROMPT },
+              output: { criteria: EMAIL_EVAL_CRITERIA },
+              metadata: {},
+            },
+          ],
+        },
+      });
+    });
+
+    // Same prompt and criteria as the basic test, but user instructions naming the
+    // connector ID are stored before the dataset is evaluated.
+    evaluate(
+      'sends an email using user instructions',
+      async ({ evaluateConnectorDataset, kbnClient }) => {
+        // Instruction text; it embeds the ID of the connector created in `beforeAll`.
+        const instructions = `<email_instructions>
+      If the user's query requires sending an email:
+      1. Use the email connector type ".email" with ID "${emailConnectorId}".
+      2. Prepare the email parameters:
+        - Recipient email address(es) in the "to" field (array of strings)
+        - Subject in the "subject" field (string)
+        - Email body in the "message" field (string)
+      3. Include
+        - Details for the alert along with a link to the alert
+        - Root cause analysis
+        - All of the details we discussed in this conversation
+        - Remediation recommendations
+        - Link to Business Health Dashboard
+      4. Execute the connector using this format:
+        execute_connector(
+          id="${emailConnectorId}",
+          params={
+            "to": ["recipient@example.com"],
+            "subject": "Your Email Subject",
+            "message": "Your email content here."
+          }
+        )
+      5. Check the response and confirm if the email was sent successfully.
+  </email_instructions>`;
+
+        // Stored under the id 'send_email'; the `afterAll` hook of this describe
+        // deletes the 'send_email' knowledge base entry again.
+        await kbnClient.request({
+          method: 'PUT',
+          path: '/internal/observability_ai_assistant/kb/user_instructions',
+          body: { id: 'send_email', text: instructions, public: false },
+        });
+
+        await evaluateConnectorDataset({
+          dataset: {
+            name: 'connector: with email connector (user instructions)',
+            description: 'Validates connector usage guided by user instructions.',
+            examples: [
+              {
+                input: { prompt: EMAIL_PROMPT },
+                output: { criteria: EMAIL_EVAL_CRITERIA },
+                metadata: {},
+              },
+            ],
+          },
+        });
+      }
+    );
+
+    // Cleans up what this describe created: the email connector and the
+    // 'send_email' user instructions.
+    evaluate.afterAll(async ({ kbnClient, log }) => {
+      try {
+        // `emailConnectorId` is only assigned at the end of `beforeAll`; skip the
+        // request when setup failed instead of deleting ".../connector/undefined".
+        if (emailConnectorId) {
+          // Delete the email connector
+          await kbnClient.request({
+            method: 'DELETE',
+            path: `/api/actions/connector/${emailConnectorId}`,
+          });
+          log.success('Email connector deleted');
+        }
+      } finally {
+        // Delete the user instructions even if connector deletion failed, so a
+        // cleanup error does not leave the instructions behind.
+        await kbnClient.request({
+          method: 'DELETE',
+          path: '/internal/observability_ai_assistant/kb/entries/send_email',
+        });
+        log.success('User instructions deleted');
+      }
+    });
+  });
+});
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/evals/connector/evaluate_connector_dataset.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/evals/connector/evaluate_connector_dataset.ts
@@ -0,0 +1,85 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import type { Example } from '@arizeai/phoenix-client/dist/esm/types/datasets';
+import type { DefaultEvaluators, KibanaPhoenixClient } from '@kbn/evals';
+import type { EvaluationDataset } from '@kbn/evals/src/types';
+import type { ObservabilityAIAssistantEvaluationChatClient } from '../../src/chat_client';
+
+/**
+ * Dataset example for the connector evaluation: narrows `Example` so that the
+ * input carries a prompt string and the output carries a list of criteria strings.
+ */
+interface ConnectorExample extends Example {
+  input: {
+    prompt: string;
+  };
+  output: {
+    criteria: string[];
+  };
+}
+
+/**
+ * Signature of the function that evaluates a connector dataset. It takes the
+ * dataset's name, description and examples and returns a promise with no value.
+ */
+export type EvaluateConnectorDataset = (options: {
+  dataset: {
+    name: string;
+    description: string;
+    examples: ConnectorExample[];
+  };
+}) => Promise<void>;
+
+/**
+ * Builds the function that evaluates a connector dataset.
+ *
+ * The returned function passes the dataset to `phoenixClient.runExperiment`.
+ * The experiment task sends each example's prompt to `chatClient.complete` and
+ * returns the resulting errors and messages. A single LLM evaluator named
+ * 'connector-evaluator' then checks the output against the example's criteria.
+ *
+ * @param evaluators Source of the `criteria` evaluator.
+ * @param phoenixClient Client used to run the experiment.
+ * @param chatClient Client used to obtain the assistant's response for a prompt.
+ * @returns The dataset evaluation function.
+ */
+export function createEvaluateConnectorDataset({
+  evaluators,
+  phoenixClient,
+  chatClient,
+}: {
+  evaluators: DefaultEvaluators;
+  phoenixClient: KibanaPhoenixClient;
+  chatClient: ObservabilityAIAssistantEvaluationChatClient;
+}): EvaluateConnectorDataset {
+  const evaluateConnectorDataset: EvaluateConnectorDataset = async ({
+    dataset: { name, description, examples },
+  }) => {
+    const experimentDataset = { name, description, examples } satisfies EvaluationDataset;
+
+    await phoenixClient.runExperiment(
+      {
+        dataset: experimentDataset,
+        // Task: ask the assistant with the example's prompt and keep errors and messages.
+        task: async ({ input }) => {
+          const { errors, messages } = await chatClient.complete({ messages: input.prompt });
+
+          return { errors, messages };
+        },
+      },
+      [
+        {
+          name: 'connector-evaluator',
+          kind: 'LLM',
+          // Falls back to an empty criteria list when the example defines none.
+          evaluate: async ({ input, output, expected, metadata }) =>
+            evaluators
+              .criteria(expected.criteria ?? [])
+              .evaluate({ input, expected, output, metadata }),
+        },
+      ]
+    );
+  };
+
+  return evaluateConnectorDataset;
+}
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/src/evaluate.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/src/evaluate.ts
@@ -27,9 +27,12 @@ export const evaluate = base.extend<
     },
   ],
   chatClient: [
-    async ({ fetch, log, connector }, use, testInfo) => {
-      const chatClient = new ObservabilityAIAssistantEvaluationChatClient(fetch, log, connector.id);
+    async ({ fetch, log, connector, knowledgeBaseClient }, use) => {
+      // Ensure the KB fixture is initialized before creating the chat client.
+      // This guarantees KB is installed even if no spec references knowledgeBaseClient directly.
+      await knowledgeBaseClient.ensureInstalled();
 
+      const chatClient = new ObservabilityAIAssistantEvaluationChatClient(fetch, log, connector.id);
       await use(chatClient);
     },
     {
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/src/knowledge_base_client.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/src/knowledge_base_client.ts
@@ -30,13 +30,10 @@ export class KnowledgeBaseClient {
         const response = await this.fetch<{}>('/internal/observability_ai_assistant/kb/setup', {
           method: 'POST',
           query: {
+            inference_id: '.elser-2-elasticsearch',
             wait_until_complete: true,
           },
-          body: JSON.stringify({
-            query: {
-              inference_id: '.elser-2-elasticsearch',
-            },
-          }),
+          body: JSON.stringify({}),
         });
 
         this.log.info('Knowledge base is ready');
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/tsconfig.json b/x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/tsconfig.json
@@ -22,5 +22,6 @@
     "@kbn/core",
     "@kbn/core-http-browser",
     "@kbn/scout-oblt",
+    "@kbn/alerts-ui-shared",
   ]
 }
diff --git a/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/connector/index.spec.ts b/x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/connector/index.spec.ts
@@ -7,6 +7,12 @@
 
 /// <reference types="@kbn/ambient-ftr-types"/>
 
+/**
+ * NOTE: This scenario has been migrated to the new evaluation framework.
+ * - x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/evals/connector/connector.spec.ts
+ * Any changes should be made in both places until the legacy evaluation framework is removed.
+ */
+
 import expect from '@kbn/expect';
 import { EXECUTE_CONNECTOR_FUNCTION_NAME } from '@kbn/observability-ai-assistant-plugin/common';
 import { chatClient, kibanaClient, logger } from '../../services';
@@ -124,6 +130,10 @@ describe('execute_connector function', () => {
         pathname: `/api/actions/connector/${emailConnectorId}`,
       });
       logger.success('Email connector deleted');
+      // delete the user instructions
+      await fetch('/internal/observability_ai_assistant/kb/entries/send_email', {
+        method: 'DELETE',
+      });
     });
   });
 });

Original file line number	Diff line number	Diff line change
`@@ -22,5 +22,6 @@`
`22`	`22`	`"@kbn/core",`
`23`	`23`	`"@kbn/core-http-browser",`
`24`	`24`	`"@kbn/scout-oblt",`
	`25`	`+ "@kbn/alerts-ui-shared",`
`25`	`26`	`]`
`26`	`27`	`}`