Add e2e test pipeline for promptflow-evals #2647

Merged
merged 13 commits on Apr 9, 2024
115 changes: 115 additions & 0 deletions .github/workflows/promptflow-evals-e2e-test.yml
@@ -0,0 +1,115 @@
name: promptflow-evals-e2e-test

on:
schedule:
- cron: "40 10 * * *" # 2:40 PST every day
pull_request:
paths:
- src/promptflow-evals/**
- .github/workflows/promptflow-evals-e2e-test.yml
workflow_dispatch:

env:
IS_IN_CI_PIPELINE: "true"
WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
RECORD_DIRECTORY: ${{ github.workspace }}/src/promptflow-recording

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: snok/install-poetry@v1
- name: build
run: poetry build
working-directory: ${{ env.WORKING_DIRECTORY }}
- uses: actions/upload-artifact@v4
with:
name: promptflow-evals
path: ${{ env.WORKING_DIRECTORY }}/dist/promptflow_evals-*.whl

test:
needs: build
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
        # TODO: Follow up with the PF team on the attribute error from Python 3.8 and 3.9.
python-version: ['3.10', '3.11']
#python-version: ['3.8', '3.9', '3.10', '3.11']
fail-fast: false
    # snok/install-poetry needs this to support Windows
defaults:
run:
shell: bash
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: set test mode
        # Always run in replay mode for now, until we figure out the test resources needed to run live mode
run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
#run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- uses: snok/install-poetry@v1
- uses: actions/download-artifact@v4
with:
name: promptflow-evals
path: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow packages in editable mode
run: |
poetry run pip install -e ../promptflow-core
poetry run pip install -e ../promptflow-devkit
poetry run pip install -e ../promptflow-tools
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install promptflow-evals from wheel
# wildcard expansion (*) does not work in Windows, so leverage python to find and install
run: poetry run pip install --pre $(python -c "import glob; print(glob.glob('promptflow_evals-*.whl')[0])")
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install test dependency group
run: poetry install --only test
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: install recording
run: poetry install
working-directory: ${{ env.RECORD_DIRECTORY }}
- name: generate end-to-end test config from secret
# TODO: replace with evals secret
run: echo '${{ secrets.PF_TRACING_E2E_TEST_CONFIG }}' >> connections.json
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: run e2e tests
run: poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
working-directory: ${{ env.WORKING_DIRECTORY }}
- name: upload coverage report
uses: actions/upload-artifact@v4
with:
name: report-${{ matrix.os }}-py${{ matrix.python-version }}
path: |
${{ env.WORKING_DIRECTORY }}/*.xml
${{ env.WORKING_DIRECTORY }}/htmlcov/

report:
needs: test
runs-on: ubuntu-latest
permissions:
checks: write
pull-requests: write
contents: read
issues: read
steps:
- uses: actions/download-artifact@v4
with:
path: artifacts
- uses: EnricoMi/publish-unit-test-result-action@v2
with:
check_name: promptflow-evals test result
comment_title: promptflow-evals test result
files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml
- uses: irongut/CodeCoverageSummary@v1.3.0
with:
filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
badge: true
fail_below_min: false
format: markdown
hide_complexity: true
output: both
thresholds: 40 80
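A note on the "install promptflow-evals from wheel" step above: shell wildcard expansion is unreliable on the Windows runners, so the workflow lets Python resolve the wheel filename. A minimal standalone sketch of that resolution logic (the missing-wheel guard is an illustrative addition, not part of the workflow step):

```python
import glob
import subprocess
import sys

# Locate the wheel produced by `poetry build`, the same way the workflow step does.
matches = glob.glob("promptflow_evals-*.whl")
if not matches:
    sys.exit("no promptflow_evals wheel found; run `poetry build` in src/promptflow-evals first")

# Install the first match, allowing pre-release versions (mirrors `pip install --pre`).
subprocess.check_call([sys.executable, "-m", "pip", "install", "--pre", matches[0]])
```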
19 changes: 19 additions & 0 deletions scripts/dev-setup/test_resources.py
@@ -35,6 +35,24 @@ def create_tracing_test_resource_template() -> None:
_prompt_user_for_test_resources(connections_file_path)


def create_evals_test_resource_template() -> None:
working_dir = REPO_ROOT_DIR / "src" / "promptflow-evals"
connections_filename = "connections.json"
connections_file_path = (working_dir / connections_filename).resolve().absolute()
connections_template = {
"azure_open_ai_connection": {
"value": {
"api_key": "aoai-api-key",
"api_base": "aoai-api-endpoint",
"api_version": "2023-07-01-preview",
}
}
}
with open(connections_file_path, mode="w", encoding="utf-8") as f:
json.dump(connections_template, f, ensure_ascii=False, indent=4)
_prompt_user_for_test_resources(connections_file_path)


def create_tools_test_resource_template() -> None:
working_dir = REPO_ROOT_DIR / "src" / "promptflow-tools"
example_file_path = (working_dir / "connections.json.example").resolve().absolute()
@@ -46,4 +64,5 @@ def create_tools_test_resource_template() -> None:
REGISTERED_TEST_RESOURCES_FUNCTIONS = {
"promptflow-tracing": create_tracing_test_resource_template,
"promptflow-tools": create_tools_test_resource_template,
"promptflow-evals": create_evals_test_resource_template,
}
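For context, the registry above is a simple name-to-function dispatch; a hypothetical caller in the dev-setup tooling might look like the sketch below (the function name and error message are illustrative, not taken from the script):

```python
def create_test_resource_template(package: str) -> None:
    # Look up the template generator registered for the requested package and run it.
    try:
        create_template = REGISTERED_TEST_RESOURCES_FUNCTIONS[package]
    except KeyError:
        raise ValueError(f"no test resource template registered for {package!r}")
    create_template()

# e.g. create_test_resource_template("promptflow-evals") writes
# src/promptflow-evals/connections.json and prompts for the real values.
```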
52 changes: 27 additions & 25 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -1,11 +1,14 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Optional, Dict, Callable
from typing import Callable, Dict, Optional

import pandas as pd
from ._flow_run_wrapper import FlowRunWrapper

from promptflow.client import PFClient

from ._flow_run_wrapper import FlowRunWrapper


def _calculate_mean(df) -> Dict[str, float]:
mean_value = df.mean(numeric_only=True)
@@ -42,15 +45,15 @@ def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_


def evaluate(
*,
evaluation_name: Optional[str] = None,
target: Optional[Callable] = None,
data: Optional[str] = None,
evaluators: Optional[Dict[str, Callable]] = None,
evaluator_config: Optional[Dict[str, Dict[str, str]]] = {},
tracking_uri: Optional[str] = None,
output_path: Optional[str] = None,
**kwargs,
*,
evaluation_name: Optional[str] = None,
target: Optional[Callable] = None,
data: Optional[str] = None,
evaluators: Optional[Dict[str, Callable]] = None,
evaluator_config: Optional[Dict[str, Dict[str, str]]] = {},
tracking_uri: Optional[str] = None,
output_path: Optional[str] = None,
**kwargs,
):
"""Evaluates target or data with built-in evaluation metrics

@@ -77,14 +80,17 @@ def evaluate(
pf_client = PFClient()

for evaluator_name, evaluator in evaluators.items():
evaluator_run_list.append(FlowRunWrapper(pf_client.run(
flow=evaluator,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data,
stream=True
),
prefix=evaluator_name
))
evaluator_run_list.append(
FlowRunWrapper(
pf_client.run(
flow=evaluator,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data,
stream=True,
),
prefix=evaluator_name,
)
)

result_df = None
for eval_run in evaluator_run_list:
@@ -94,16 +100,12 @@
result_df = pd.concat(
[eval_run.get_result_df(all_results=True, exclude_inputs=True), result_df],
axis=1,
verify_integrity=True
verify_integrity=True,
)

input_data_df = pd.read_json(data, lines=True)
input_data_df = input_data_df.rename(columns={col: f"inputs.{col}" for col in input_data_df.columns})

row_results = pd.concat([input_data_df, result_df], axis=1, verify_integrity=True)

return {
"rows": row_results.to_dict("records"),
"metrics": _calculate_mean(result_df),
"traces": {}
}
return {"rows": row_results.to_dict("records"), "metrics": _calculate_mean(result_df), "traces": {}}
src/promptflow-evals/promptflow/evals/evaluate/_flow_run_wrapper.py
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import time

from promptflow.client import PFClient


@@ -16,17 +17,19 @@ def get_result_df(self, all_results=True, exclude_inputs=False):
self._wait_for_completion()
result_df = self.client.get_details(self.flow_run.name, all_results=all_results)
if exclude_inputs:
result_df = result_df.drop(
columns=[col for col in result_df.columns if col.startswith("inputs.")]
)
result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
result_df.rename(
columns={col: col.replace("outputs", self.prefix)
for col in [col for col in result_df.columns if col.startswith("outputs.")]},
inplace=True)
columns={
col: col.replace("outputs", self.prefix)
for col in [col for col in result_df.columns if col.startswith("outputs.")]
},
inplace=True,
)
return result_df

def _wait_for_completion(self):
from promptflow._sdk._constants import RunStatus

while True:
if self.run.status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED]:
break
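The rename in `get_result_df` swaps the `outputs` prefix for the evaluator's prefix so that results from several evaluators can be concatenated without column collisions. A small pandas sketch of the same transformation (the DataFrame contents and the `relevance` prefix are made up for illustration):

```python
import pandas as pd

prefix = "relevance"  # hypothetical evaluator name used as the new column prefix
result_df = pd.DataFrame({
    "inputs.question": ["What is 2 + 2?"],
    "outputs.score": [5],
})

# Drop input columns (exclude_inputs=True), then re-prefix the output columns.
result_df = result_df.drop(columns=[c for c in result_df.columns if c.startswith("inputs.")])
result_df = result_df.rename(
    columns={c: c.replace("outputs", prefix) for c in result_df.columns if c.startswith("outputs.")}
)
print(result_df.columns.tolist())  # ['relevance.score']
```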
2 changes: 1 addition & 1 deletion src/promptflow-evals/pyproject.toml
@@ -94,4 +94,4 @@ include_external_packages = "True"
name = "Contract forbidden modules"
type = "forbidden"
source_modules = ["promptflow.evals"]
forbidden_modules = []
forbidden_modules = []