Add e2e test pipeline for promptflow-evals (#2647)
# Description

Please add an informative description that covers the changes made by
the pull request and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
ninghu authored Apr 9, 2024
1 parent d567042 commit 5767b00
Showing 13 changed files with 488 additions and 154 deletions.
115 changes: 115 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test.yml
@@ -0,0 +1,115 @@
name: promptflow-evals-e2e-test

on:
  schedule:
    - cron: "40 10 * * *"  # 2:40 PST every day
  pull_request:
    paths:
      - src/promptflow-evals/**
      - .github/workflows/promptflow-evals-e2e-test.yml
  workflow_dispatch:

env:
  IS_IN_CI_PIPELINE: "true"
  WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
  RECORD_DIRECTORY: ${{ github.workspace }}/src/promptflow-recording

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: snok/install-poetry@v1
      - name: build
        run: poetry build
        working-directory: ${{ env.WORKING_DIRECTORY }}
      - uses: actions/upload-artifact@v4
        with:
          name: promptflow-evals
          path: ${{ env.WORKING_DIRECTORY }}/dist/promptflow_evals-*.whl

  test:
    needs: build
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        # TODO: follow up with the PF team on the attribute error seen with 3.8 and 3.9.
        python-version: ['3.10', '3.11']
        #python-version: ['3.8', '3.9', '3.10', '3.11']
      fail-fast: false
    # snok/install-poetry needs this to support Windows
    defaults:
      run:
        shell: bash
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - name: set test mode
        # Always run in replay mode for now until we figure out the test resources needed for live mode
        run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
        #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - uses: snok/install-poetry@v1
      - uses: actions/download-artifact@v4
        with:
          name: promptflow-evals
          path: ${{ env.WORKING_DIRECTORY }}
      - name: install promptflow packages in editable mode
        run: |
          poetry run pip install -e ../promptflow-core
          poetry run pip install -e ../promptflow-devkit
          poetry run pip install -e ../promptflow-tools
        working-directory: ${{ env.WORKING_DIRECTORY }}
      - name: install promptflow-evals from wheel
        # wildcard expansion (*) does not work on Windows, so use python to find and install the wheel
        run: poetry run pip install --pre $(python -c "import glob; print(glob.glob('promptflow_evals-*.whl')[0])")
        working-directory: ${{ env.WORKING_DIRECTORY }}
      - name: install test dependency group
        run: poetry install --only test
        working-directory: ${{ env.WORKING_DIRECTORY }}
      - name: install recording
        run: poetry install
        working-directory: ${{ env.RECORD_DIRECTORY }}
      - name: generate end-to-end test config from secret
        # TODO: replace with an evals secret
        run: echo '${{ secrets.PF_TRACING_E2E_TEST_CONFIG }}' >> connections.json
        working-directory: ${{ env.WORKING_DIRECTORY }}
      - name: run e2e tests
        run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
        working-directory: ${{ env.WORKING_DIRECTORY }}
      - name: upload coverage report
        uses: actions/upload-artifact@v4
        with:
          name: report-${{ matrix.os }}-py${{ matrix.python-version }}
          path: |
            ${{ env.WORKING_DIRECTORY }}/*.xml
            ${{ env.WORKING_DIRECTORY }}/htmlcov/

  report:
    needs: test
    runs-on: ubuntu-latest
    permissions:
      checks: write
      pull-requests: write
      contents: read
      issues: read
    steps:
      - uses: actions/download-artifact@v4
        with:
          path: artifacts
      - uses: EnricoMi/publish-unit-test-result-action@v2
        with:
          check_name: promptflow-evals test result
          comment_title: promptflow-evals test result
          files: "artifacts/**/test-results.xml"  # align with `--junit-xml` in pyproject.toml
      - uses: irongut/CodeCoverageSummary@v1.3.0
        with:
          filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
          badge: true
          fail_below_min: false
          format: markdown
          hide_complexity: true
          output: both
          thresholds: 40 80
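
A note on the "install promptflow-evals from wheel" step: the wheel is located through a Python one-liner because `*` wildcard expansion is not portable to the Windows runners. A rough standalone equivalent of that lookup, as a sketch; invoking `pip` via `sys.executable` instead of `poetry run` is an illustrative simplification:

```python
# Sketch of the wheel lookup used by the "install promptflow-evals from wheel" step.
# Run from src/promptflow-evals after the build artifact has been downloaded there.
import glob
import subprocess
import sys

wheels = sorted(glob.glob("promptflow_evals-*.whl"))
if not wheels:
    sys.exit("no promptflow_evals wheel found; run `poetry build` or download the CI artifact first")

# Equivalent of: poetry run pip install --pre <wheel>, but via the current interpreter.
subprocess.check_call([sys.executable, "-m", "pip", "install", "--pre", wheels[0]])
```
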
19 changes: 19 additions & 0 deletions scripts/dev-setup/test_resources.py
@@ -35,6 +35,24 @@ def create_tracing_test_resource_template() -> None:
     _prompt_user_for_test_resources(connections_file_path)
 
 
+def create_evals_test_resource_template() -> None:
+    working_dir = REPO_ROOT_DIR / "src" / "promptflow-evals"
+    connections_filename = "connections.json"
+    connections_file_path = (working_dir / connections_filename).resolve().absolute()
+    connections_template = {
+        "azure_open_ai_connection": {
+            "value": {
+                "api_key": "aoai-api-key",
+                "api_base": "aoai-api-endpoint",
+                "api_version": "2023-07-01-preview",
+            }
+        }
+    }
+    with open(connections_file_path, mode="w", encoding="utf-8") as f:
+        json.dump(connections_template, f, ensure_ascii=False, indent=4)
+    _prompt_user_for_test_resources(connections_file_path)
+
+
 def create_tools_test_resource_template() -> None:
     working_dir = REPO_ROOT_DIR / "src" / "promptflow-tools"
     example_file_path = (working_dir / "connections.json.example").resolve().absolute()
@@ -46,4 +64,5 @@ def create_tools_test_resource_template() -> None:
 REGISTERED_TEST_RESOURCES_FUNCTIONS = {
     "promptflow-tracing": create_tracing_test_resource_template,
     "promptflow-tools": create_tools_test_resource_template,
+    "promptflow-evals": create_evals_test_resource_template,
 }
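
The new generator is registered in `REGISTERED_TEST_RESOURCES_FUNCTIONS`, so the dev-setup tooling can resolve it by package name. A minimal dispatch sketch, assuming the registry is importable from `test_resources`; the argument handling here is illustrative, not the script's actual CLI:

```python
# Illustrative dispatch over the registry shown above; the real dev-setup entry point may differ.
import argparse

from test_resources import REGISTERED_TEST_RESOURCES_FUNCTIONS  # scripts/dev-setup/test_resources.py

parser = argparse.ArgumentParser(description="Generate a test resource template for a package")
parser.add_argument("package", choices=sorted(REGISTERED_TEST_RESOURCES_FUNCTIONS))
args = parser.parse_args()

# "promptflow-evals" now maps to create_evals_test_resource_template, which writes the
# connections.json template and then calls _prompt_user_for_test_resources on it.
REGISTERED_TEST_RESOURCES_FUNCTIONS[args.package]()
```
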
52 changes: 27 additions & 25 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -1,11 +1,14 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional, Dict, Callable
+from typing import Callable, Dict, Optional
 
 import pandas as pd
-from ._flow_run_wrapper import FlowRunWrapper
 
 from promptflow.client import PFClient
 
+from ._flow_run_wrapper import FlowRunWrapper
+
 
 def _calculate_mean(df) -> Dict[str, float]:
     mean_value = df.mean(numeric_only=True)
@@ -42,15 +45,15 @@ def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_
 
 
 def evaluate(
-        *,
-        evaluation_name: Optional[str] = None,
-        target: Optional[Callable] = None,
-        data: Optional[str] = None,
-        evaluators: Optional[Dict[str, Callable]] = None,
-        evaluator_config: Optional[Dict[str, Dict[str, str]]] = {},
-        tracking_uri: Optional[str] = None,
-        output_path: Optional[str] = None,
-        **kwargs,
+    *,
+    evaluation_name: Optional[str] = None,
+    target: Optional[Callable] = None,
+    data: Optional[str] = None,
+    evaluators: Optional[Dict[str, Callable]] = None,
+    evaluator_config: Optional[Dict[str, Dict[str, str]]] = {},
+    tracking_uri: Optional[str] = None,
+    output_path: Optional[str] = None,
+    **kwargs,
 ):
     """Evaluates target or data with built-in evaluation metrics
@@ -77,14 +80,17 @@ def evaluate(
     pf_client = PFClient()
 
     for evaluator_name, evaluator in evaluators.items():
-        evaluator_run_list.append(FlowRunWrapper(pf_client.run(
-            flow=evaluator,
-            column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
-            data=data,
-            stream=True
-        ),
-            prefix=evaluator_name
-        ))
+        evaluator_run_list.append(
+            FlowRunWrapper(
+                pf_client.run(
+                    flow=evaluator,
+                    column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                    data=data,
+                    stream=True,
+                ),
+                prefix=evaluator_name,
+            )
+        )
 
     result_df = None
     for eval_run in evaluator_run_list:
@@ -94,16 +100,12 @@ def evaluate(
             result_df = pd.concat(
                 [eval_run.get_result_df(all_results=True, exclude_inputs=True), result_df],
                 axis=1,
-                verify_integrity=True
+                verify_integrity=True,
             )
 
     input_data_df = pd.read_json(data, lines=True)
     input_data_df = input_data_df.rename(columns={col: f"inputs.{col}" for col in input_data_df.columns})
 
     row_results = pd.concat([input_data_df, result_df], axis=1, verify_integrity=True)
 
-    return {
-        "rows": row_results.to_dict("records"),
-        "metrics": _calculate_mean(result_df),
-        "traces": {}
-    }
+    return {"rows": row_results.to_dict("records"), "metrics": _calculate_mean(result_df), "traces": {}}
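
For orientation, a minimal usage sketch of the keyword-only `evaluate` API as it stands after this change. The evaluator, file name, and column mapping below are placeholders; only the signature and the rows/metrics/traces return shape come from the diff, and the package-level import is assumed.

```python
from promptflow.evals import evaluate  # assumed re-export of promptflow.evals.evaluate._evaluate.evaluate


# Hypothetical evaluator: any callable/flow accepted by PFClient.run() works the same way.
def answer_length(answer: str) -> dict:
    return {"length": len(answer)}


result = evaluate(
    evaluation_name="sample-eval",                               # optional display name
    data="data.jsonl",                                           # JSON Lines file, one record per row
    evaluators={"answer_length": answer_length},
    evaluator_config={"default": {"answer": "${data.answer}"}},  # promptflow-style column mapping (illustrative)
)

print(result["metrics"])    # aggregated numeric means, e.g. {"answer_length.length": ...}
print(result["rows"][:1])   # per-row records with "inputs." and "<evaluator>." prefixed columns
```
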
src/promptflow-evals/promptflow/evals/evaluate/_flow_run_wrapper.py
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import time
+
 from promptflow.client import PFClient
 
 
@@ -16,17 +17,19 @@ def get_result_df(self, all_results=True, exclude_inputs=False):
         self._wait_for_completion()
         result_df = self.client.get_details(self.flow_run.name, all_results=all_results)
         if exclude_inputs:
-            result_df = result_df.drop(
-                columns=[col for col in result_df.columns if col.startswith("inputs.")]
-            )
+            result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         result_df.rename(
-            columns={col: col.replace("outputs", self.prefix)
-                     for col in [col for col in result_df.columns if col.startswith("outputs.")]},
-            inplace=True)
+            columns={
+                col: col.replace("outputs", self.prefix)
+                for col in [col for col in result_df.columns if col.startswith("outputs.")]
+            },
+            inplace=True,
+        )
         return result_df
 
     def _wait_for_completion(self):
         from promptflow._sdk._constants import RunStatus
+
         while True:
             if self.run.status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED]:
                 break
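
The prefixing in `get_result_df` is what keeps several evaluators from colliding when their frames are concatenated in `_evaluate.py`. A small pandas-only sketch of that drop-and-rename, with made-up column names:

```python
import pandas as pd

# Hypothetical frame in the shape returned by pf_client.get_details():
# inputs echoed under "inputs.", evaluator outputs under "outputs.".
result_df = pd.DataFrame(
    {
        "inputs.question": ["What is 2 + 2?"],
        "outputs.score": [0.9],
        "outputs.reason": ["exact match"],
    }
)

prefix = "relevance"  # the evaluator name passed to FlowRunWrapper

# Same logic as get_result_df(exclude_inputs=True): drop echoed inputs, re-prefix outputs.
result_df = result_df.drop(columns=[c for c in result_df.columns if c.startswith("inputs.")])
result_df = result_df.rename(
    columns={c: c.replace("outputs", prefix) for c in result_df.columns if c.startswith("outputs.")}
)

print(list(result_df.columns))  # ['relevance.score', 'relevance.reason']
```
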
2 changes: 1 addition & 1 deletion src/promptflow-evals/pyproject.toml
@@ -94,4 +94,4 @@ include_external_packages = "True"
 name = "Contract forbidden modules"
 type = "forbidden"
 source_modules = ["promptflow.evals"]
-forbidden_modules = []
+forbidden_modules = []
File renamed without changes.