Skip to content

Commit 07e9b9f

Browse files
[Obs AI Assistant] migrate connector evaluation to new framework (#234219)
Migrate connector evaluation to new framework. <details> <summary>results from single run</summary> <img width="683" height="597" alt="Screenshot 2025-09-08 at 16 58 59" src="https://github.com/user-attachments/assets/eac3171a-81f6-4965-8ef2-f53fcc8625db" /> <img width="682" height="606" alt="Screenshot 2025-09-08 at 16 58 03" src="https://github.com/user-attachments/assets/0a51bd44-6294-419d-90e1-7fea3ebfab0b" /> <img width="672" height="600" alt="Screenshot 2025-09-08 at 16 56 21" src="https://github.com/user-attachments/assets/11222b2f-35c3-4fd2-b8d5-b039973a08bc" /> <img width="699" height="590" alt="Screenshot 2025-09-08 at 16 56 12" src="https://github.com/user-attachments/assets/045aa493-3673-4b35-9f19-260a1d3742a7" /> <img width="687" height="610" alt="Screenshot 2025-09-08 at 16 55 07" src="https://github.com/user-attachments/assets/2600898d-debe-49d1-984e-56134b7eab4a" /> <img width="689" height="594" alt="Screenshot 2025-09-08 at 16 54 03" src="https://github.com/user-attachments/assets/a73e5fa3-73c1-45df-a873-39ccad36e9a7" /> <img width="674" height="598" alt="Screenshot 2025-09-08 at 16 53 58" src="https://github.com/user-attachments/assets/56b57da8-8fb4-48e9-b942-eb3f5679e284" /> </details> ## Run tests Start the server: `node scripts/scout.js start-server --stateful` Run the tests: `EVALUATION_CONNECTOR_ID=gemini-2_5-pro node scripts/playwright test --config x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/playwright.config.ts --grep "execute_connector function"` --------- Co-authored-by: kibanamachine <[email protected]>
1 parent abc2b35 commit 07e9b9f

File tree

6 files changed

+286
-7
lines changed

6 files changed

+286
-7
lines changed
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
import { EXECUTE_CONNECTOR_FUNCTION_NAME } from '@kbn/observability-ai-assistant-plugin/common';
9+
import type { ActionConnector } from '@kbn/alerts-ui-shared/src/common/types';
10+
import { evaluate as base } from '../../src/evaluate';
11+
import type { EvaluateConnectorDataset } from './evaluate_connector_dataset';
12+
import { createEvaluateConnectorDataset } from './evaluate_connector_dataset';
13+
14+
/**
15+
* NOTE: This scenario has been migrated from the legacy evaluation framework.
16+
* - x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/connector/index.spec.ts
17+
* Any changes should be made in both places until the legacy evaluation framework is removed.
18+
*/
19+
20+
/**
 * User prompt shared by every connector scenario in this spec: asks the
 * assistant to send a single email with a fixed recipient, subject and body.
 */
const EMAIL_PROMPT =
  'Send an email to [email protected] with the subject "Test Email" and body "This is a test email."';
22+
23+
/**
 * Criteria used as the expected output for the scenarios in which an email
 * connector exists. Both "with email connector" tests reuse this list.
 */
const EMAIL_EVAL_CRITERIA = [
  // The connector function must be invoked before the final answer.
  `Uses the ${EXECUTE_CONNECTOR_FUNCTION_NAME} function to send the email before providing a final answer to the user.`,
  // The assistant announces what it is about to send.
  'Clearly explains to the user that an email will be sent and summarizes the provided details (recipient, subject, body).',
  // The assistant reports the outcome.
  'Confirms successful email delivery and includes the recipient, subject, and message in the summary.',
  // The answer stays on topic.
  'Does not include irrelevant or unrelated information in the response.',
];
29+
30+
/**
 * Spec-local `evaluate` test object: the shared base extended with a
 * test-scoped `evaluateConnectorDataset` fixture built from the base
 * `chatClient`, `evaluators` and `phoenixClient` fixtures.
 */
const evaluate = base.extend<{
  evaluateConnectorDataset: EvaluateConnectorDataset;
}>({
  evaluateConnectorDataset: [
    async ({ chatClient, evaluators, phoenixClient }, use) => {
      // `use` returns a promise that settles once the consuming test is done;
      // awaiting it keeps the fixture alive for the whole test and avoids a
      // floating promise.
      await use(
        createEvaluateConnectorDataset({
          chatClient,
          evaluators,
          phoenixClient,
        })
      );
    },
    { scope: 'test' },
  ],
});
46+
47+
// Evaluation scenarios for the execute_connector function: one without any
// email connector, and two with an email connector created for the suite.
evaluate.describe('execute_connector function', { tag: '@svlOblt' }, () => {
  evaluate.describe('no email connector available', () => {
    evaluate(
      'does not send an email and fails gracefully',
      async ({ evaluateConnectorDataset }) => {
        await evaluateConnectorDataset({
          dataset: {
            name: 'connector: no email connector',
            description: 'Validate behavior when no Actions email connector exists.',
            examples: [
              {
                input: { prompt: EMAIL_PROMPT },
                output: {
                  criteria: [
                    `Does not use ${EXECUTE_CONNECTOR_FUNCTION_NAME} function.`,
                    'Explains that no connectors are available to send the email.',
                    'Does not attempt to send an email.',
                    'Mentions that sending the email was unsuccessful.',
                  ],
                },
                metadata: {},
              },
            ],
          },
        });
      }
    );
  });

  evaluate.describe('with email connector', () => {
    // Assigned in beforeAll; stays unset if connector creation fails.
    let emailConnectorId: string;

    evaluate.beforeAll(async ({ kbnClient, log }) => {
      const { data } = await kbnClient.request<ActionConnector>({
        method: 'POST',
        path: '/api/actions/connector',
        body: {
          name: 'email-connector-test',
          config: {
            // NOTE(review): the diff line directly above `service` is blank in
            // this view; `.email` connectors usually also take a `from`
            // address. Confirm against the upstream file.
            service: '__json',
          },
          secrets: {
            user: 'test',
            password: '123456',
          },
          connector_type_id: '.email',
        },
      });
      log.success('Email connector created successfully');

      emailConnectorId = data.id;
    });

    evaluate('sends an email (basic)', async ({ evaluateConnectorDataset }) => {
      await evaluateConnectorDataset({
        dataset: {
          name: 'connector: with email connector (basic)',
          description:
            'Validates that the assistant uses execute_connector and summarizes correctly.',
          examples: [
            {
              input: { prompt: EMAIL_PROMPT },
              output: { criteria: EMAIL_EVAL_CRITERIA },
              metadata: {},
            },
          ],
        },
      });
    });

    evaluate(
      'sends an email using user instructions',
      async ({ evaluateConnectorDataset, kbnClient }) => {
        // The template text below is sent verbatim as the user instruction.
        const instructions = `<email_instructions>
If the user's query requires sending an email:
1. Use the email connector type ".email" with ID "${emailConnectorId}".
2. Prepare the email parameters:
- Recipient email address(es) in the "to" field (array of strings)
- Subject in the "subject" field (string)
- Email body in the "message" field (string)
3. Include
- Details for the alert along with a link to the alert
- Root cause analysis
- All of the details we discussed in this conversation
- Remediation recommendations
- Link to Business Health Dashboard
4. Execute the connector using this format:
execute_connector(
id="${emailConnectorId}",
params={
"to": ["[email protected]"],
"subject": "Your Email Subject",
"message": "Your email content here."
}
)
5. Check the response and confirm if the email was sent successfully.
</email_instructions>`;

        // Store the instruction under the id `send_email`; afterAll removes it.
        await kbnClient.request({
          method: 'PUT',
          path: '/internal/observability_ai_assistant/kb/user_instructions',
          body: { id: 'send_email', text: instructions, public: false },
        });

        await evaluateConnectorDataset({
          dataset: {
            name: 'connector: with email connector (user instructions)',
            description: 'Validates connector usage guided by user instructions.',
            examples: [
              {
                input: { prompt: EMAIL_PROMPT },
                output: { criteria: EMAIL_EVAL_CRITERIA },
                metadata: {},
              },
            ],
          },
        });
      }
    );

    evaluate.afterAll(async ({ kbnClient, log }) => {
      try {
        // Skip the connector DELETE when beforeAll never produced an id;
        // otherwise the request would target `/api/actions/connector/undefined`.
        if (emailConnectorId) {
          await kbnClient.request({
            method: 'DELETE',
            path: `/api/actions/connector/${emailConnectorId}`,
          });
          log.success('Email connector deleted');
        }
      } finally {
        // Always attempt to delete the user instructions, even if the
        // connector deletion above failed.
        await kbnClient.request({
          method: 'DELETE',
          path: '/internal/observability_ai_assistant/kb/entries/send_email',
        });
        log.success('User instructions deleted');
      }
    });
  });
});
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
import type { Example } from '@arizeai/phoenix-client/dist/esm/types/datasets';
9+
import type { DefaultEvaluators, KibanaPhoenixClient } from '@kbn/evals';
10+
import type { EvaluationDataset } from '@kbn/evals/src/types';
11+
import type { ObservabilityAIAssistantEvaluationChatClient } from '../../src/chat_client';
12+
13+
/**
 * Dataset example for the connector evaluations: a Phoenix `Example` whose
 * input and output are narrowed to the fields this suite reads.
 */
interface ConnectorExample extends Example {
  /** Task input; `prompt` is forwarded to the chat client as the message. */
  input: { prompt: string };
  /** Expected outcome; `criteria` is handed to the criteria evaluator. */
  output: { criteria: string[] };
}
21+
22+
/**
 * Function that runs one connector evaluation for the given dataset and
 * resolves once the experiment has finished.
 */
export type EvaluateConnectorDataset = (options: {
  dataset: {
    name: string;
    description: string;
    examples: ConnectorExample[];
  };
}) => Promise<void>;
31+
32+
/**
 * Builds the `evaluateConnectorDataset` function used by the connector spec.
 *
 * @param evaluators - provides the `criteria` evaluator used to score outputs.
 * @param phoenixClient - client whose `runExperiment` executes the dataset.
 * @param chatClient - client used to send each example's prompt.
 * @returns a function that runs one experiment for the supplied dataset.
 */
export function createEvaluateConnectorDataset({
  evaluators,
  phoenixClient,
  chatClient,
}: {
  evaluators: DefaultEvaluators;
  phoenixClient: KibanaPhoenixClient;
  chatClient: ObservabilityAIAssistantEvaluationChatClient;
}): EvaluateConnectorDataset {
  const evaluateConnectorDataset: EvaluateConnectorDataset = async ({
    dataset: { name, description, examples },
  }) => {
    // Rebuild the dataset so its shape is checked against EvaluationDataset.
    const experimentDataset = { name, description, examples } satisfies EvaluationDataset;

    await phoenixClient.runExperiment(
      {
        dataset: experimentDataset,
        // Task: send the example prompt and keep only errors and messages.
        task: async ({ input }) => {
          const { errors, messages } = await chatClient.complete({
            messages: input.prompt,
          });

          return { errors, messages };
        },
      },
      [
        {
          name: 'connector-evaluator',
          kind: 'LLM',
          // Score the task output against the example's criteria; an empty
          // list is used when no criteria are present.
          evaluate: async ({ input, output, expected, metadata }) => {
            return evaluators
              .criteria(expected.criteria ?? [])
              .evaluate({ input, expected, output, metadata });
          },
        },
      ]
    );
  };

  return evaluateConnectorDataset;
}

x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/src/evaluate.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,12 @@ export const evaluate = base.extend<
2727
},
2828
],
2929
chatClient: [
30-
async ({ fetch, log, connector }, use, testInfo) => {
31-
const chatClient = new ObservabilityAIAssistantEvaluationChatClient(fetch, log, connector.id);
30+
async ({ fetch, log, connector, knowledgeBaseClient }, use) => {
31+
// Ensure the KB fixture is initialized before creating the chat client.
32+
// This guarantees KB is installed even if no spec references knowledgeBaseClient directly.
33+
await knowledgeBaseClient.ensureInstalled();
3234

35+
const chatClient = new ObservabilityAIAssistantEvaluationChatClient(fetch, log, connector.id);
3336
await use(chatClient);
3437
},
3538
{

x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/src/knowledge_base_client.ts

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,10 @@ export class KnowledgeBaseClient {
3030
const response = await this.fetch<{}>('/internal/observability_ai_assistant/kb/setup', {
3131
method: 'POST',
3232
query: {
33+
inference_id: '.elser-2-elasticsearch',
3334
wait_until_complete: true,
3435
},
35-
body: JSON.stringify({
36-
query: {
37-
inference_id: '.elser-2-elasticsearch',
38-
},
39-
}),
36+
body: JSON.stringify({}),
4037
});
4138

4239
this.log.info('Knowledge base is ready');

x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/tsconfig.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,6 @@
2222
"@kbn/core",
2323
"@kbn/core-http-browser",
2424
"@kbn/scout-oblt",
25+
"@kbn/alerts-ui-shared",
2526
]
2627
}

x-pack/solutions/observability/plugins/observability_ai_assistant_app/scripts/evaluation/scenarios/connector/index.spec.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77

88
/// <reference types="@kbn/ambient-ftr-types"/>
99

10+
/**
11+
* NOTE: This scenario has been migrated to the new evaluation framework.
12+
* - x-pack/solutions/observability/packages/kbn-evals-suite-obs-ai-assistant/evals/connector/connector.spec.ts
13+
* Any changes should be made in both places until the legacy evaluation framework is removed.
14+
*/
15+
1016
import expect from '@kbn/expect';
1117
import { EXECUTE_CONNECTOR_FUNCTION_NAME } from '@kbn/observability-ai-assistant-plugin/common';
1218
import { chatClient, kibanaClient, logger } from '../../services';
@@ -124,6 +130,10 @@ describe('execute_connector function', () => {
124130
pathname: `/api/actions/connector/${emailConnectorId}`,
125131
});
126132
logger.success('Email connector deleted');
133+
// delete the user instructions
134+
await fetch('/internal/observability_ai_assistant/kb/entries/send_email', {
135+
method: 'DELETE',
136+
});
127137
});
128138
});
129139
});

0 commit comments

Comments
 (0)