Support for evaluation flow #77


Merged 32 commits on Oct 9, 2024

Commits (32)
065250c
Ground truth generation and evaluation
pamelafox Aug 30, 2024
66a443b
Evaluation
pamelafox Aug 30, 2024
154c4d2
Start workflow
pamelafox Aug 30, 2024
5f709cd
Merge branch 'main' into generate
pamelafox Oct 3, 2024
d95076c
Bicep for GPT-4
pamelafox Oct 3, 2024
a1d56ca
Configure Azure Developer Pipeline
pamelafox Oct 3, 2024
b47a2db
Fix CI
pamelafox Oct 3, 2024
fa64c0e
Nohup
pamelafox Oct 3, 2024
a01686a
Update target URL
pamelafox Oct 3, 2024
d41ed1a
Keep server running
pamelafox Oct 3, 2024
c1d9861
Login with azd
pamelafox Oct 3, 2024
c8d4194
Add error output
pamelafox Oct 3, 2024
ceb09a6
Try to get server logs
pamelafox Oct 7, 2024
3d77deb
More log upload
pamelafox Oct 7, 2024
9e6db05
Change env, logging
pamelafox Oct 7, 2024
7292376
Dix .env
pamelafox Oct 7, 2024
95f1b8d
Configure Azure Developer Pipeline
pamelafox Oct 7, 2024
ed730fd
Configure Azure Developer Pipeline
pamelafox Oct 7, 2024
2442d01
Provide default version
pamelafox Oct 7, 2024
6835c84
Get azd up to finish
pamelafox Oct 7, 2024
eb2c987
Grant SP the role as well
pamelafox Oct 7, 2024
cf4f183
Grant SP the role as well
pamelafox Oct 7, 2024
c8f3b03
Configure Azure Developer Pipeline
pamelafox Oct 9, 2024
7aca632
Add missing outputs
pamelafox Oct 9, 2024
e15dc6d
Also provision
pamelafox Oct 9, 2024
f64ddd8
Evaluate on comment
pamelafox Oct 9, 2024
25a26c2
error model
pamelafox Oct 9, 2024
bf4940b
Merge branch 'main' into generate
pamelafox Oct 9, 2024
0ec49e3
mypy fixes
pamelafox Oct 9, 2024
9cc444c
Merge branch 'generate' of https://github.com/Azure-Samples/rag-postg…
pamelafox Oct 9, 2024
f4b79fd
Fix credential
pamelafox Oct 9, 2024
1133f50
Bring in package-lock.json from main
pamelafox Oct 9, 2024
8 changes: 3 additions & 5 deletions .devcontainer/devcontainer.json
@@ -19,7 +19,9 @@
"ghcr.io/azure/azure-dev/azd:latest": {
"version": "1.10.1"
},
"ghcr.io/prulloac/devcontainer-features/ollama:1": {}
"ghcr.io/prulloac/devcontainer-features/ollama:1": {},
// az CLI is helpful for being able to login correctly with DefaultAzureCredential:
"ghcr.io/devcontainers/features/azure-cli": {}
},
// Configure tool-specific properties.
"customizations": {
@@ -46,10 +48,6 @@
"source.fixAll": "explicit"
},
"editor.defaultFormatter": "charliermarsh.ruff"
},
"files.exclude": {
".ruff_cache": true,
".pytest_cache": true
}
}
}
9 changes: 6 additions & 3 deletions .env.sample
@@ -13,12 +13,15 @@ OPENAI_EMBED_HOST=azure
# You also need to `azd auth login` if running this locally
AZURE_OPENAI_ENDPOINT=https://YOUR-AZURE-OPENAI-SERVICE-NAME.openai.azure.com
AZURE_OPENAI_VERSION=2024-03-01-preview
AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
AZURE_OPENAI_CHAT_MODEL=gpt-35-turbo
AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-4o-mini
AZURE_OPENAI_CHAT_MODEL=gpt-4o-mini
AZURE_OPENAI_EMBED_DEPLOYMENT=text-embedding-ada-002
AZURE_OPENAI_EMBED_MODEL=text-embedding-ada-002
AZURE_OPENAI_EMBED_MODEL_DIMENSIONS=1536
AZURE_OPENAI_EMBED_DIMENSIONS=1536
AZURE_OPENAI_EMBEDDING_COLUMN=embedding_ada002
AZURE_OPENAI_EVAL_DEPLOYMENT=gpt-4
AZURE_OPENAI_EVAL_MODEL=gpt-4
AZURE_TENANT_ID=
# Only needed when using key-based Azure authentication:
AZURE_OPENAI_KEY=
# Needed for OpenAI.com:
5 changes: 5 additions & 0 deletions .github/workflows/app-tests.yaml
@@ -80,6 +80,11 @@ jobs:
cd ./src/frontend
npm install
npm run build
- name: cache mypy
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: ./.mypy_cache
key: mypy${{ matrix.os }}-${{ matrix.python_version }}-${{ hashFiles('requirements-dev.txt', 'src/backend/requirements.txt', 'src/backend/pyproject.toml') }}
- name: Run MyPy
run: python3 -m mypy .
- name: Run Pytest
3 changes: 1 addition & 2 deletions .github/workflows/azure-dev.yaml
@@ -3,8 +3,7 @@ name: Deploy with azd
on:
workflow_dispatch:
push:
branches:
- main
branches: [ main ]

# GitHub Actions workflow to deploy to Azure using azd
# To configure required secrets for connecting to Azure, simply run `azd pipeline config`
170 changes: 170 additions & 0 deletions .github/workflows/evaluate.yaml
@@ -0,0 +1,170 @@
name: Evaluate

on:
workflow_dispatch:
issue_comment:
types: [created]

# Set up permissions for deploying with secretless Azure federated credentials
# https://learn.microsoft.com/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication
permissions:
id-token: write
contents: read

jobs:
evaluate:
if: github.event_name == 'workflow_dispatch' || contains(github.event.comment.body, '#evaluate')
runs-on: ubuntu-latest
env:
AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
AZURE_CREDENTIALS: ${{ secrets.AZURE_CREDENTIALS }}
AZURE_RESOURCE_GROUP: ${{ vars.AZURE_RESOURCE_GROUP }}
steps:
- name: Check for evaluate hash tag
if: contains(github.event.comment.body, '#evaluate')
run: |
echo "Comment contains #evaluate hashtag"

- uses: actions/checkout@v4
- name: Install PostgreSQL development libraries
run: |
sudo apt update
sudo apt install postgresql-server-dev-14
- name: Setup postgres
uses: ikalnytskyi/action-setup-postgres@v6
with:
username: admin
password: postgres
database: postgres

- name: Install pgvector on MacOS/Linux using install-pgvector.sh
run: .github/workflows/install-pgvector.sh

- name: Install python
uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Install azd
uses: Azure/setup-azd@v1.0.0

- name: Install dependencies
run: |
python -m pip install -r requirements-dev.txt

- name: Install app as editable app
run: |
python -m pip install -e src/backend

- name: Setup local database with seed data
run: |
python ./src/backend/fastapi_app/setup_postgres_database.py
python ./src/backend/fastapi_app/setup_postgres_seeddata.py
env:
POSTGRES_HOST: localhost
POSTGRES_USERNAME: admin
POSTGRES_PASSWORD: postgres
POSTGRES_DATABASE: postgres
POSTGRES_SSL: disable

- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 18

- name: Build frontend
run: |
cd ./src/frontend
npm install
npm run build

- name: Install python packages
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt

- name: Login to Azure
uses: azure/login@v2
with:
client-id: ${{ env.AZURE_CLIENT_ID }}
tenant-id: ${{ env.AZURE_TENANT_ID }}
subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }}

- name: Set az account
uses: azure/CLI@v2
with:
inlineScript: |
az account set --subscription ${{env.AZURE_SUBSCRIPTION_ID}}

- name: Log in with Azure (Federated Credentials)
if: ${{ env.AZURE_CLIENT_ID != '' }}
run: |
azd auth login `
--client-id "$Env:AZURE_CLIENT_ID" `
--federated-credential-provider "github" `
--tenant-id "$Env:AZURE_TENANT_ID"
shell: pwsh

- name: Provision Infrastructure
run: azd provision --no-prompt
env:
AZD_INITIAL_ENVIRONMENT_CONFIG: ${{ secrets.AZD_INITIAL_ENVIRONMENT_CONFIG }}

- name: Run local server in background
run: |
RUNNER_TRACKING_ID="" && (nohup python3 -m uvicorn fastapi_app:create_app --factory > serverlogs.out 2> serverlogs.err &)
env:
OPENAI_CHAT_HOST: ${{ vars.OPENAI_CHAT_HOST }}
OPENAI_EMBED_HOST: ${{ vars.OPENAI_EMBED_HOST }}
AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }}
AZURE_OPENAI_VERSION: ${{ vars.AZURE_OPENAI_VERSION }}
AZURE_OPENAI_CHAT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT }}
AZURE_OPENAI_CHAT_MODEL: ${{ vars.AZURE_OPENAI_CHAT_MODEL }}
AZURE_OPENAI_EMBED_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EMBED_DEPLOYMENT }}
AZURE_OPENAI_EMBED_MODEL: ${{ vars.AZURE_OPENAI_EMBED_MODEL }}
AZURE_OPENAI_EMBED_DIMENSIONS: ${{ vars.AZURE_OPENAI_EMBED_DIMENSIONS }}
AZURE_OPENAI_EMBEDDING_COLUMN: ${{ vars.AZURE_OPENAI_EMBEDDING_COLUMN }}
POSTGRES_HOST: localhost
POSTGRES_USERNAME: admin
POSTGRES_PASSWORD: postgres
POSTGRES_DATABASE: postgres
POSTGRES_SSL: disable
- name: Evaluate local RAG flow
run: |
python evals/evaluate.py
env:
OPENAI_CHAT_HOST: ${{ vars.OPENAI_CHAT_HOST }}
AZURE_OPENAI_ENDPOINT: ${{ vars.AZURE_OPENAI_ENDPOINT }}
AZURE_OPENAI_VERSION: ${{ vars.AZURE_OPENAI_VERSION }}
AZURE_OPENAI_CHAT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT }}
AZURE_OPENAI_CHAT_MODEL: ${{ vars.AZURE_OPENAI_CHAT_MODEL }}
AZURE_OPENAI_EVAL_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT }}
AZURE_OPENAI_EVAL_MODEL: ${{ vars.AZURE_OPENAI_EVAL_MODEL }}
- name: Upload server logs as build artifact
uses: actions/upload-artifact@v4
with:
name: server_logs
path: ./serverlogs.out

- name: Upload server error logs as build artifact
uses: actions/upload-artifact@v4
with:
name: server_error_logs
path: ./serverlogs.err

- name: Upload eval results as build artifact
uses: actions/upload-artifact@v4
with:
name: eval_result
path: ./src/api/evaluate/eval_results.jsonl

- name: GitHub Summary Step
if: ${{ success() }}
working-directory: ./src/api
run: |
echo "" >> $GITHUB_STEP_SUMMARY

echo "📊 Promptflow Evaluation Results" >> $GITHUB_STEP_SUMMARY
cat evaluate/eval_results.md >> $GITHUB_STEP_SUMMARY
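The evaluate.yaml workflow above launches uvicorn in the background with `nohup` and then immediately runs `evals/evaluate.py` against it. A hypothetical readiness poll (not part of this PR; the URL and timeout values are assumptions) could guard against the evaluation starting before the server is listening:

```python
# Hypothetical helper, not part of this PR: poll the local server started by
# the "Run local server in background" step until it answers, so the
# evaluation step does not race the server startup. URL/timeouts are assumed.
import time
import urllib.error
import urllib.request


def wait_for_server(url: str = "http://127.0.0.1:8000/", timeout: float = 60.0) -> bool:
    """Poll `url` until it responds or `timeout` seconds elapse."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2):
                return True  # got a 2xx/3xx response; server is up
        except urllib.error.HTTPError:
            return True  # got an HTTP error status; server is still up
        except (urllib.error.URLError, ConnectionError):
            time.sleep(1)  # not listening yet; retry
    return False
```

Such a check could run as a small step between starting the server and invoking the evaluation script.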
10 changes: 9 additions & 1 deletion .vscode/settings.json
@@ -27,5 +27,13 @@
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"files.exclude": {
".ruff_cache": true,
".pytest_cache": true,
"__pycache__": true,
"htmlcov": true,
".mypy_cache": true,
".coverage": true
}
}
3 changes: 2 additions & 1 deletion README.md
@@ -123,7 +123,7 @@ Once you've opened the project in [Codespaces](#github-codespaces), [Dev Contain
azd up
```

You will be asked to select two locations, first a region for most of the resources (Container Apps, PostgreSQL), then a region specifically for the Azure OpenAI models. This project uses the gpt-3.5-turbo (version 0125) and text-embedding-ada-002 models which may not be available in all Azure regions. Check for [up-to-date region availability](https://learn.microsoft.com/azure/ai-services/openai/concepts/models#standard-deployment-model-availability) and select a region accordingly.
You will be asked to select two locations, first a region for most of the resources (Container Apps, PostgreSQL), then a region specifically for the Azure OpenAI models. This project uses the gpt-4o-mini and text-embedding-ada-002 models which may not be available in all Azure regions. Check for [up-to-date region availability](https://learn.microsoft.com/azure/ai-services/openai/concepts/models#standard-deployment-model-availability) and select a region accordingly.

## Local Development

@@ -207,6 +207,7 @@ Further documentation is available in the `docs/` folder:
* [Using Entra auth with PostgreSQL tools](docs/using_entra_auth.md)
* [Monitoring with Azure Monitor](docs/monitoring.md)
* [Load testing](docs/loadtesting.md)
* [Evaluation](docs/evaluation.md)

Please post in the issue tracker with any questions or issues.

13 changes: 11 additions & 2 deletions azure.yaml
@@ -39,12 +39,21 @@ hooks:
pipeline:
variables:
- DEPLOY_AZURE_OPENAI
- AZURE_OPENAI_CHAT_MODEL
- OPENAI_CHAT_HOST
- OPENAI_EMBED_HOST
- AZURE_OPENAI_ENDPOINT
- AZURE_OPENAI_VERSION
- AZURE_OPENAI_CHAT_DEPLOYMENT
- AZURE_OPENAI_CHAT_MODEL
- AZURE_OPENAI_CHAT_DEPLOYMENT_VERSION
- AZURE_OPENAI_CHAT_DEPLOYMENT_CAPACITY
- AZURE_OPENAI_EMBED_MODEL
- AZURE_OPENAI_EVAL_DEPLOYMENT_SKU
- AZURE_OPENAI_EMBED_DEPLOYMENT
- AZURE_OPENAI_EMBED_MODEL
- AZURE_OPENAI_EMBED_DEPLOYMENT_VERSION
- AZURE_OPENAI_EMBED_DEPLOYMENT_CAPACITY
- AZURE_OPENAI_EMBED_DEPLOYMENT_SKU
- AZURE_OPENAI_EMBED_DIMENSIONS
- AZURE_OPENAI_EMBEDDING_COLUMN
- AZURE_OPENAI_EVAL_DEPLOYMENT
- AZURE_OPENAI_EVAL_MODEL
75 changes: 75 additions & 0 deletions docs/evaluation.md
@@ -0,0 +1,75 @@
# Evaluating the RAG answer quality

## Deploy a GPT-4 model


1. Run this command to tell `azd` to deploy a GPT-4 model for evaluation:

```shell
azd env set DEPLOY_EVAL_MODEL true
```

2. Set the capacity to the highest possible value to ensure that the evaluation runs quickly.

```shell
azd env set AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY 100
```

By default, that will provision a `gpt-4` model, version `turbo-2024-04-09`. To change those settings, set the `AZURE_OPENAI_EVAL_DEPLOYMENT` and `AZURE_OPENAI_EVAL_DEPLOYMENT_VERSION` environment variables.

3. Then, run the following command to provision the model:

```shell
azd provision
```

## Set up the evaluation environment

Install all the dependencies for the evaluation script by running the following command:

```bash
pip install -r requirements-dev.txt
```

## Generate ground truth data

Generate ground truth data by running the following command:

```bash
python evals/generate.py
```

Review the generated data after running that script, removing any question/answer pairs that don't seem like realistic user input.

## Evaluate the RAG answer quality

Review the configuration in `evals/eval_config.json` to ensure that everything is correctly set up. You may want to adjust the metrics used. [TODO: link to evaluator docs]

By default, the evaluation script will evaluate every question in the ground truth data.
Run the evaluation script by running the following command:

```bash
python evals/evaluate.py
```

## Review the evaluation results

The evaluation script will output a summary of the evaluation results, inside the `evals/results` directory.

You can see a summary of results across all evaluation runs by running the following command:

```bash
python -m evaltools summary evals/results
```

Compare answers across runs by running the following command:

```bash
python -m evaltools diff evals/results/baseline/
```

## Run the evaluation in GitHub Actions


# TODO: Add GPT-4 deployment with high capacity for evaluation
# TODO: Add CI workflow that can be triggered to run the evaluate on the local app
16 changes: 16 additions & 0 deletions evals/eval_config.json
@@ -0,0 +1,16 @@
{
"testdata_path": "ground_truth.jsonl",
"results_dir": "results/experiment<TIMESTAMP>",
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citation_match"],
"target_url": "http://127.0.0.1:8000/chat",
"target_parameters": {
"overrides": {
"use_advanced_flow": true,
"top": 3,
"retrieval_mode": "hybrid",
"temperature": 0.3
}
},
"target_response_answer_jmespath": "message.content",
"target_response_context_jmespath": "context.data_points"
}
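The `target_response_answer_jmespath` and `target_response_context_jmespath` keys in eval_config.json tell the evaluation tooling which fields to extract from each `/chat` response. As an illustration only: the real tooling presumably evaluates these expressions with a JMESPath library; this minimal dot-path lookup and the sample response shape are assumptions, not code from this PR.

```python
# Illustrative sketch: what the JMESPath expressions in eval_config.json
# select from a /chat response. `jmes_get` handles only the simple dotted-key
# subset of JMESPath, and the sample payload shape is an assumption.
def jmes_get(data: dict, path: str):
    """Follow a dotted key path (e.g. "message.content") into nested dicts."""
    for key in path.split("."):
        data = data[key]
    return data


sample_response = {
    "message": {"content": "Our lightest tent weighs 2 pounds."},
    "context": {"data_points": ["Skyline Tent: 2 lb, 3-season"]},
}

answer = jmes_get(sample_response, "message.content")        # the model answer
context = jmes_get(sample_response, "context.data_points")   # retrieved chunks
```

The extracted answer and context are what the requested metrics (groundedness, relevance, citation match, and so on) are scored against.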