Commit b8df335: Add devcontainer for running the sample

Parent: d44081f
File tree: 8 files changed, +184, -0 lines

.devcontainer/Dockerfile (+24 lines)

```dockerfile
# Use a base image that supports Python.
FROM mcr.microsoft.com/vscode/devcontainers/python:1-3.12-bullseye

# Install Python dependencies
COPY requirements.txt /tmp/pip-tmp/
RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
    && rm -rf /tmp/pip-tmp

# Install additional tools and dependencies
RUN apt-get update \
    && apt-get upgrade -y \
    && export DEBIAN_FRONTEND=noninteractive \
    && apt-get -y install --no-install-recommends unzip jq poppler-utils

# Install yq
RUN wget -qO /usr/local/bin/yq "https://github.com/mikefarah/yq/releases/download/v4.25.1/yq_linux_amd64" \
    && chmod +x /usr/local/bin/yq

# Default to bash shell
ENV SHELL=/bin/bash \
    DOCKER_BUILDKIT=1

# Mount for docker-in-docker
VOLUME [ "/var/lib/docker" ]
```
.devcontainer/devcontainer.json (+55 lines)

```json
{
    "name": "Document Data Extraction Prompt Flow Evaluation",
    "build": {
        "dockerfile": "Dockerfile",
        "context": ".."
    },
    "features": {
        "ghcr.io/devcontainers/features/git:1": {
            "version": "latest",
            "ppa": "false"
        },
        "ghcr.io/devcontainers/features/azure-cli:1": {},
        "ghcr.io/azure/azure-dev/azd:0": {},
        "ghcr.io/devcontainers/features/git-lfs:1": {
            "version": "latest"
        },
        "ghcr.io/devcontainers/features/github-cli:1": {
            "version": "latest"
        },
        "ghcr.io/devcontainers/features/docker-in-docker:2": {
            "version": "latest"
        },
        "./local-features/setup": "latest"
    },
    "overrideFeatureInstallOrder": [
        "ghcr.io/devcontainers/features/git",
        "ghcr.io/devcontainers/features/azure-cli",
        "ghcr.io/azure/azure-dev/azd",
        "./local-features/setup",
        "ghcr.io/devcontainers/features/git-lfs",
        "ghcr.io/devcontainers/features/github-cli",
        "ghcr.io/devcontainers/features/docker-in-docker"
    ],
    "remoteUser": "vscode",
    "containerUser": "vscode",
    "forwardPorts": [],
    "otherPortsAttributes": {
        "onAutoForward": "ignore"
    },
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-python.vscode-pylance",
                "ms-python.python",
                "ms-python.debugpy",
                "ms-toolsai.jupyter",
                "tomoki1207.pdf",
                "ms-azuretools.vscode-bicep",
                "ms-vscode.vscode-node-azure-pack",
                "GitHub.vscode-pull-request-github",
                "prompt-flow.prompt-flow"
            ]
        }
    }
}
```
.devcontainer/local-features/setup (feature definition referenced from devcontainer.json; +11 lines)

```json
{
    "id": "local-setup",
    "name": "Setup for Local Environment",
    "installsAfter": [
        "ghcr.io/devcontainers/features/azure-cli"
    ],
    "install": {
        "app": "",
        "file": "install.sh"
    }
}
```
.devcontainer/local-features/setup/install.sh (+34 lines)

```bash
#!/usr/bin/env bash

USERNAME=${USERNAME:-"vscode"}

set -eux

if [ "$(id -u)" -ne 0 ]; then
    echo 'Script must be run as root. Use sudo, su, or add "USER root" to your Dockerfile before running this script.'
    exit 1
fi

export DEBIAN_FRONTEND=noninteractive

# Run a command as $USERNAME when executing as root; otherwise run it directly.
# (The else branch uses `bash -c` so a multi-word command string is parsed,
# rather than treated as a single command name.)
sudo_if() {
    COMMAND="$*"
    if [ "$(id -u)" -eq 0 ] && [ "$USERNAME" != "root" ]; then
        su - "$USERNAME" -c "$COMMAND"
    else
        bash -c "$COMMAND"
    fi
}

install_azcli_extension() {
    EXTENSION_NAME=$1

    sudo_if "az extension add -n $EXTENSION_NAME"
    sudo_if "az extension update -n $EXTENSION_NAME"
}

# Install the Azure CLI Machine Learning extension
install_azcli_extension ml

# Register the Bash kernel with Jupyter
sudo_if "python3 -m bash_kernel.install"
```

.gitignore (+1 line)

```diff
@@ -168,3 +168,4 @@ cython_debug/
 
 # Outputs
 *Outputs.json
+tests/**/*.jpg
```

.vscode/extensions.json (+13 lines)

```json
{
    "recommendations": [
        "ms-python.vscode-pylance",
        "ms-python.python",
        "ms-python.debugpy",
        "ms-toolsai.jupyter",
        "tomoki1207.pdf",
        "ms-azuretools.vscode-bicep",
        "ms-vscode.vscode-node-azure-pack",
        "GitHub.vscode-pull-request-github",
        "prompt-flow.prompt-flow"
    ]
}
```

README.md (+44 lines)

# Document Data Extraction with GPT-4o and Evaluation using Prompt Flow

This sample demonstrates [how to use GPT-4o](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4o-and-gpt-4-turbo) to extract structured JSON data from PDF documents and evaluate the extracted data using the [Prompt Flow](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/flow-bulk-test-evaluation) feature in Azure AI Studio.

The approach builds on the understanding that [Azure OpenAI GPT-4o is effective at analyzing document images and extracting structured JSON objects](https://github.com/Azure-Samples/azure-openai-gpt-4-vision-pdf-extraction-sample) from them, based on an extraction prompt that includes an expected output schema. Evaluating document data extraction with Prompt Flow in Azure AI Studio offers the following advantages:

- **Automated evaluation**: Custom Prompt Flow evaluations let you create an automated run that evaluates multiple test cases in parallel, providing a comprehensive report and analysis of all the results in one place.
- **Prompt engineering testing**: Much like traditional test cases for code, you can create various extraction prompt scenarios to evaluate changes in the prompt's performance, including variations in the schema, the GPT model parameters, and the rules for extracting data.
- **Simplicity**: Prompt Flow narrows the scope of data extraction evaluation to discrete tasks in your AI application's workflow, making it easier to evaluate and improve your extraction prompts in a controlled environment before integrating changes into your application.

The [Sample notebook](./Sample.ipynb) contains all the steps needed to deploy the infrastructure and run the sample in your Azure subscription, providing a dedicated learning environment for understanding how to use GPT-4o for document data extraction and how to evaluate the extracted data using Prompt Flow in Azure AI Studio.

> [!IMPORTANT]
> Running the evaluation prompt flow for each test case with GPT-4o accrues token-based charges, just as it would in application code. Images are tokenized by splitting each high-resolution image into separate 512 px tiles. For more information, see the [Azure OpenAI image token overview](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview#image-tokens-gpt-4-turbo-with-vision).
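
To get a rough feel for those charges, an image's token count can be estimated from its tile count. The sketch below is an illustration only, assuming the tiling rules published in the linked overview for high-detail images (85 base tokens plus 170 tokens per 512 px tile, after the image is fitted within 2048 px and its shortest side is scaled to at most 768 px); `estimate_image_tokens` is a hypothetical helper, not part of the sample.

```python
import math

def estimate_image_tokens(width: int, height: int) -> int:
    """Estimate tokens for one high-detail image using the published
    GPT-4 Turbo with Vision tiling rules (assumed here to apply to GPT-4o)."""
    # Fit the image within a 2048 x 2048 square.
    scale = min(1.0, 2048 / max(width, height))
    width, height = width * scale, height * scale
    # Scale so the shortest side is at most 768 px.
    scale = min(1.0, 768 / min(width, height))
    width, height = width * scale, height * scale
    # Count the 512 px tiles needed to cover the scaled image.
    tiles = math.ceil(width / 512) * math.ceil(height / 512)
    return 85 + 170 * tiles

# Example: an A4 page rendered at 200 DPI (1654 x 2339 px)
# scales down to roughly 768 x 1086 px, covering 2 x 3 tiles.
print(estimate_image_tokens(1654, 2339))  # 85 + 170 * 6 = 1105
```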
## Getting Started

### Prerequisites

The sample repository comes with a [**Dev Container**](https://code.visualstudio.com/docs/remote/containers) that contains all the necessary tools and dependencies to run the sample. To use the Dev Container, you need the following installed on your local machine:

- [**Visual Studio Code**](https://code.visualstudio.com/download)
- [**Docker Desktop**](https://www.docker.com/products/docker-desktop)
- The [**Remote - Containers**](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension for Visual Studio Code

### Run the sample notebook

Before running the notebook, open the project in Visual Studio Code and start the Dev Container. This ensures that all the necessary dependencies are installed and the environment is ready to run the notebook.

Once the Dev Container is running, open the [**Sample.ipynb**](./Sample.ipynb) notebook and follow the instructions in it to run the sample.

> [!NOTE]
> The sample guides you through deploying the necessary infrastructure, deploying the Prompt Flows to Azure AI Studio, and finally running the evaluation for the document data extraction.

### Clean up resources

After you have finished running the sample, clean up the resources using the following steps:

1. Run the `az group delete` command to delete the resource group and all the resources within it.

   ```bash
   az group delete --name <resource-group-name> --yes --no-wait
   ```

`<resource-group-name>` is the name of the resource group, which can be found in the **resourceGroupInfo** JSON object in the [**EnvironmentOutputs.json**](./EnvironmentOutputs.json) file created after running the Sample notebook.
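
If you prefer to script the clean-up, the resource group name can be read from that file first. A minimal sketch, assuming **resourceGroupInfo** exposes the group name under a `name` key; the exact shape of the file depends on the deployment outputs written by the notebook.

```python
import json

# EnvironmentOutputs.json is written by the Sample notebook.
with open("EnvironmentOutputs.json") as f:
    outputs = json.load(f)

# "resourceGroupInfo" comes from the README above; the nested "name"
# key is an assumption about the shape of the deployment outputs.
resource_group = outputs["resourceGroupInfo"]["name"]
print(f"az group delete --name {resource_group} --yes --no-wait")
```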

requirements.txt (+2 lines)

```diff
@@ -1,7 +1,9 @@
 azure-ai-resources==1.0.0.b8
 azure-identity==1.17.1
+bash_kernel==0.9.3
 ipykernel==6.29.4
 notebook==7.2.1
+pdf2image==1.17.0
 promptflow==1.13.0
 promptflow-tools==1.4.0
 python-dotenv==1.0.1
```
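
Of the two new dependencies, `bash_kernel` backs the Jupyter kernel registered by the feature's install script above, and `pdf2image` (together with the `poppler-utils` package installed in the Dockerfile) converts PDF pages into images that can be sent to GPT-4o. A minimal sketch of that conversion step, with hypothetical file paths:

```python
from pdf2image import convert_from_path

# Render each page of the PDF as a PIL image.
# Requires poppler, provided by poppler-utils in the dev container.
pages = convert_from_path("tests/invoice.pdf", dpi=200)  # hypothetical input

for i, page in enumerate(pages):
    # Saved .jpg files match the tests/**/*.jpg pattern added to .gitignore.
    page.save(f"tests/invoice_page_{i}.jpg", "JPEG")
```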
