Skip to content

Commit c852b1d

Browse files
author
Adam Holm
committed
Update namespaces, add YAML eval methods, and docs
Updated namespaces to HillPhelmuth.SemanticKernel.LlmAsJudgeEvals across multiple files. Added new methods in EvalService.cs for handling evaluation functions from YAML. Updated dependencies in PromptFlowEvalsAsPlugins.csproj to version 1.18.2. Added a new Readme.md with detailed documentation.
1 parent 0906c3b commit c852b1d

File tree

14 files changed

+127
-45
lines changed

14 files changed

+127
-45
lines changed

Demo/PromptFlowEvalsAsPlugins.Demo/Components/EvalInputs/EvalDisplay.razor

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
<RadzenDataGrid @ref=_grid Data="EvalResultDisplays" TItem="EvalResultDisplay" AllowSorting=true AllowFiltering="true" FilterCaseSensitivity="FilterCaseSensitivity.CaseInsensitive" Density="Density.Compact" AllowPaging=true PageSize="12">
1+
@using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals
2+
<RadzenDataGrid @ref=_grid Data="EvalResultDisplays" TItem="EvalResultDisplay" AllowSorting=true AllowFiltering="true" FilterCaseSensitivity="FilterCaseSensitivity.CaseInsensitive" Density="Density.Compact" AllowPaging=true PageSize="12">
23
<Template Context="display">
34
<RadzenTabs>
45
<Tabs>

Demo/PromptFlowEvalsAsPlugins.Demo/Components/EvalInputs/QnAGenerator.razor.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using System;
33
using System.Collections.Generic;
44
using System.Linq;
5+
using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
56

67
namespace PromptFlowEvalsAsPlugins.Demo.Components.EvalInputs;
78

Demo/PromptFlowEvalsAsPlugins.Demo/Components/EvalsRag/AddRagContent.razor.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
using System;
33
using System.Collections.Generic;
44
using System.Linq;
5+
using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
56

67
namespace PromptFlowEvalsAsPlugins.Demo.Components.EvalsRag;
78

Demo/PromptFlowEvalsAsPlugins.Demo/EvalManager.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Text;
22
using System.Text.Json;
3+
using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
34
using Microsoft.SemanticKernel;
45
using Microsoft.SemanticKernel.ChatCompletion;
56
using Microsoft.SemanticKernel.Connectors.OpenAI;

PromptFlowEvalsAsPlugins/EvalService.cs

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
using Microsoft.Extensions.DependencyInjection;
1+
using System.Text.Json;
2+
using System.Text.Json.Serialization;
3+
using Microsoft.Extensions.DependencyInjection;
24
using Microsoft.SemanticKernel;
35
using Microsoft.SemanticKernel.ChatCompletion;
46
using Microsoft.SemanticKernel.Connectors.OpenAI;
57
using Microsoft.SemanticKernel.TextGeneration;
6-
using System.Reflection;
7-
using System.Text.Json.Serialization;
8-
using System.Text.Json;
9-
using Azure.AI.OpenAI;
8+
using OpenAI.Chat;
109

11-
namespace PromptFlowEvalsAsPlugins;
10+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
1211

1312
public class EvalService
1413
{
@@ -33,7 +32,19 @@ public void AddEvalFunction(string name, KernelFunction function, bool overrideE
3332
else
3433
EvalFunctions.TryAdd(name, function);
3534
}
36-
public async Task<ResultScore> ExecuteEval(IInputModel inputModel)
35+
public void AddEvalFunctionFromYaml(Stream yamlStream, string name, bool overrideExisting = false)
36+
{
37+
var yamlText = new StreamReader(yamlStream).ReadToEnd();
38+
var function = _kernel.CreateFunctionFromPromptYaml(yamlText);
39+
AddEvalFunction(name, function, overrideExisting);
40+
}
41+
42+
public void AddEvalFunctionFromYaml(string yamlText, string name, bool overrideExisting = false)
43+
{
44+
var function = _kernel.CreateFunctionFromPromptYaml(yamlText);
45+
AddEvalFunction(name, function, overrideExisting);
46+
}
47+
public async Task<ResultScore> ExecuteEval(IInputModel inputModel)
3748
{
3849
var currentKernel = _kernel.Clone();
3950
if (currentKernel.Services.GetService<IChatCompletionService>() is null && currentKernel.Services.GetService<ITextGenerationService>() is null)
@@ -51,11 +62,11 @@ public async Task<ResultScore> ExecuteEval(IInputModel inputModel)
5162
Logprobs = true,
5263
TopLogprobs = 5
5364
};
54-
65+
5566
var kernelArgs = new KernelArguments(inputModel.RequiredInputs, new Dictionary<string, PromptExecutionSettings> { { "default", settings } });
5667
var result = await currentKernel.InvokeAsync(evalPlugin[inputModel.FunctionName], kernelArgs);
57-
var logProbs = result.Metadata?["LogProbabilityInfo"] as ChatChoiceLogProbabilityInfo;
58-
var tokenStrings = logProbs.TokenLogProbabilityResults.AsTokenStrings()[0];
68+
var logProbs = result.Metadata?["ContentTokenLogProbabilities"] as IReadOnlyList<ChatTokenLogProbabilityInfo>;
69+
var tokenStrings = logProbs.AsTokenStrings()[0];
5970
return new ResultScore(inputModel.FunctionName, tokenStrings);
6071
}
6172

@@ -90,8 +101,8 @@ public async Task<ResultScore> ExecuteScorePlusEval(IInputModel inputModel)
90101
};
91102
var finalArgs = new KernelArguments(inputModel.RequiredInputs, new Dictionary<string, PromptExecutionSettings> { { PromptExecutionSettings.DefaultServiceId, settings } });
92103
var result = await kernel.InvokeAsync(evalPlugin[inputModel.FunctionName], finalArgs);
93-
var logProbs = result.Metadata?["LogProbabilityInfo"] as ChatChoiceLogProbabilityInfo;
94-
var tokenStrings = logProbs.TokenLogProbabilityResults.AsTokenStrings();
104+
var logProbs = result.Metadata?["LogProbabilityInfo"] as IReadOnlyList<ChatTokenLogProbabilityInfo>;
105+
var tokenStrings = logProbs?.AsTokenStrings();
95106
var scoreResult = result.GetTypedResult<ScorePlusResponse>();
96107
return new ResultScore(inputModel.FunctionName, scoreResult, tokenStrings);
97108
}

PromptFlowEvalsAsPlugins/EvalType.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
namespace PromptFlowEvalsAsPlugins;
1+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
22

33
/// <summary>
44
/// Represents the evaluation type.

PromptFlowEvalsAsPlugins/Extentions.cs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,8 @@
1-
using System;
2-
using System.Collections.Generic;
3-
using System.Linq;
4-
using System.Text;
5-
using System.Text.Json;
6-
using System.Threading.Tasks;
1+
using System.Text.Json;
72
using Microsoft.SemanticKernel;
8-
using static PromptFlowEvalsAsPlugins.Helpers;
3+
using static HillPhelmuth.SemanticKernel.LlmAsJudgeEvals.Helpers;
94

10-
namespace PromptFlowEvalsAsPlugins;
5+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
116

127
public static class Extentions
138
{

PromptFlowEvalsAsPlugins/Helpers.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
using Microsoft.SemanticKernel;
2-
using System.Reflection;
1+
using System.Reflection;
32
using System.Text.Json;
3+
using Microsoft.SemanticKernel;
44

5-
namespace PromptFlowEvalsAsPlugins;
5+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
66

77
public static class Helpers
88
{

PromptFlowEvalsAsPlugins/IInputModel.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
using Microsoft.SemanticKernel;
22

3-
namespace PromptFlowEvalsAsPlugins;
3+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
44

55
public interface IInputModel
66
{

PromptFlowEvalsAsPlugins/InputModels.cs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
using Microsoft.SemanticKernel;
2-
using System;
3-
using System.Collections.Generic;
4-
using System.Linq;
5-
using System.Text;
6-
using System.Threading.Tasks;
72

8-
namespace PromptFlowEvalsAsPlugins;
3+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
94

105
/// <summary>
116
/// Represents an input model for evaluation.

PromptFlowEvalsAsPlugins/PromptFlowEvalsAsPlugins.csproj

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,21 @@
55
<ImplicitUsings>enable</ImplicitUsings>
66
<Nullable>enable</Nullable>
77
<NoWarn>SKEXP0010</NoWarn>
8-
9-
<Title>PromptFlow Eval Flows or Custom Evals as SemanticKernel Plugins</Title>
8+
<RootNamespace>HillPhelmuth.SemanticKernel.LlmAsJudgeEvals</RootNamespace>
9+
<Title>Run "LLM as Judge" Evals Using Semantic Kernel</Title>
1010
<PackageProjectUrl>https://github.com/HillPhelmuth/PromptFlowEvalsAsPlugins</PackageProjectUrl>
1111
<RepositoryUrl>https://github.com/HillPhelmuth/PromptFlowEvalsAsPlugins</RepositoryUrl>
1212
<PackageOutputPath>C:\Users\adamh\source\LocalPackages</PackageOutputPath>
1313
<GeneratePackageOnBuild>False</GeneratePackageOnBuild>
1414
<PackageIcon>packageIcon.png</PackageIcon>
15+
<Version>0.0.1-beta</Version>
16+
<Authors>HillPhelmuth</Authors>
17+
<Description>Enable seamless execution of LLM (Large Language Model) evaluations using Semantic Kernel. This library provides tools and abstractions for running automated assessments where LLMs serve as judges, offering structured, consistent, and scalable evaluation methods. Ideal for AI-driven projects that require evaluative feedback, scoring, or comparative analysis across various use cases. Easily integrates with Semantic Kernel for smooth, flexible LLM operations in .NET environments.</Description>
1518

1619
</PropertyGroup>
1720
<PropertyGroup Condition="'$(Configuration)' == 'Release'">
1821
<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
19-
<Version>0.0.3-beta</Version>
22+
<Version>0.0.4-beta</Version>
2023
</PropertyGroup>
2124
<ItemGroup>
2225
<None Remove="EvalPluginYaml\Empathy.yaml" />
@@ -49,9 +52,9 @@
4952

5053

5154
<ItemGroup>
52-
<PackageReference Include="Microsoft.SemanticKernel" Version="1.17.1" />
53-
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.17.1-alpha" />
54-
<PackageReference Include="Microsoft.SemanticKernel.Yaml" Version="1.17.1" />
55+
<PackageReference Include="Microsoft.SemanticKernel" Version="1.18.2" />
56+
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Memory" Version="1.18.2-alpha" />
57+
<PackageReference Include="Microsoft.SemanticKernel.Yaml" Version="1.18.2" />
5558
</ItemGroup>
5659

5760
<ItemGroup>

PromptFlowEvalsAsPlugins/Readme.md

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# LlmAsJudgeEvals
2+
3+
This library provides a service for evaluating responses from Large Language Models (LLMs) using the LLM itself as a judge. It leverages Semantic Kernel to define and execute evaluation functions based on prompt templates.
4+
5+
## Installation
6+
7+
Install the package via NuGet:
8+
9+
```
10+
Install-Package HillPhelmuth.SemanticKernel.LlmAsJudgeEvals
11+
```
12+
13+
## Usage
14+
15+
### Built-in Evaluation Functions
16+
17+
```csharp
18+
19+
// Initialize the Semantic Kernel
20+
var kernel = Kernel.CreateBuilder().AddOpenAIChatCompletion("openai-model-name", "openai-apiKey").Build();
21+
22+
// Create an instance of the EvalService
23+
var evalService = new EvalService(kernel);
24+
25+
// Create an input model for the built-in evaluation function
26+
var coherenceInput = InputModel.CoherenceModel("This is the answer to evaluate.", "This is the question or prompt that generated the answer");
27+
28+
// Execute the evaluation
29+
var result = await evalService.ExecuteEval(coherenceInput);
30+
31+
32+
Console.WriteLine($"Evaluation score: {result.Score}");
33+
34+
```
35+
36+
### Custom Evaluation Functions
37+
38+
```csharp
39+
40+
// Initialize the Semantic Kernel
41+
var kernel = Kernel.CreateBuilder().AddOpenAIChatCompletion("openai-model-name", "openai-apiKey").Build();
42+
43+
// Create an instance of the EvalService
44+
var evalService = new EvalService(kernel);
45+
46+
// Add a custom evaluation function from a YAML prompt template (optional)
47+
evalService.AddEvalFunctionFromYaml(yamlPromptText, "MyEvalFunction");
48+
49+
// Create an input model for the evaluation function
50+
var inputModel = new InputModel
51+
{
52+
FunctionName = "MyEvalFunction", // Replace with the name of your evaluation function
53+
RequiredInputs = new Dictionary<string, string>
54+
{
55+
{ "input", "This is the text to evaluate." }
56+
}
57+
};
58+
59+
// Execute the evaluation
60+
var result = await evalService.ExecuteEval(inputModel);
61+
62+
63+
Console.WriteLine($"Evaluation score: {result.Score}");
64+
```
65+
66+
## Features
67+
68+
* **Define evaluation functions using prompt templates:** You can define evaluation functions using prompt templates written in YAML.
69+
* **Execute evaluations:** The `EvalService` provides methods for executing evaluations on input data.
70+
* **Aggregate results:** The `EvalService` can aggregate evaluation scores across multiple inputs.
71+
* **Built-in evaluation functions:** The package includes a set of pre-defined evaluation functions based on common evaluation metrics.
72+
73+
74+

PromptFlowEvalsAsPlugins/ResultScore.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
using Microsoft.SemanticKernel;
22

3-
namespace PromptFlowEvalsAsPlugins;
3+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
44

55
public class ResultScore
66
{

PromptFlowEvalsAsPlugins/TokenString.cs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
using Azure.AI.OpenAI;
1+
using OpenAI.Chat;
22

3-
namespace PromptFlowEvalsAsPlugins;
3+
namespace HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
44

55
public record TokenString
66
{
@@ -35,15 +35,15 @@ public override string ToString()
3535
}
3636
public static class LogProbExts
3737
{
38-
public static List<TokenString> AsTokenStrings(this IEnumerable<ChatTokenLogProbabilityResult> logProbContentItems)
38+
public static List<TokenString> AsTokenStrings(this IReadOnlyList<ChatTokenLogProbabilityInfo> logProbContentItems)
3939
{
4040
var result = new List<TokenString>();
4141
foreach (var logProb in logProbContentItems)
4242
{
4343
var tokenString = new TokenString(logProb.Token, logProb.ToLinearProb());
44-
if (logProb.TopLogProbabilityEntries is { Count: > 0 })
44+
if (logProb.TopLogProbabilities is { Count: > 0 })
4545
{
46-
var innerResult = logProb.TopLogProbabilityEntries.Select(item => new TokenString(item.Token, item.ToLinearProb())).ToList();
46+
var innerResult = logProb.TopLogProbabilities.Select(item => new TokenString(item.Token, item.ToLinearProb())).ToList();
4747
tokenString.TopLogProbs = innerResult;
4848
}
4949
result.Add(tokenString);
@@ -87,7 +87,7 @@ public static IEnumerable<TokenProb> NormalizeValues(this IEnumerable<TokenProb>
8787
}
8888
//return tokenProbs.Select(token => new TokenProb(token.StringValue, token.Probability / sum));
8989
}
90-
public static double ToLinearProb(this ChatTokenLogProbabilityResult logProbabilityResult) => Math.Exp(logProbabilityResult.LogProbability);
90+
public static double ToLinearProb(this ChatTokenTopLogProbabilityInfo logProbabilityResult) => Math.Exp(logProbabilityResult.LogProbability);
9191

9292
public static double ToLinearProb(this ChatTokenLogProbabilityInfo logProbInfo) => Math.Exp(logProbInfo.LogProbability);
9393
}

0 commit comments

Comments
 (0)