From 74262a1739e45723c803ad204cc907e65e19e33a Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Fri, 12 Apr 2024 22:25:40 -0700 Subject: [PATCH 1/5] Fix mypy linting and fix logging. --- .../promptflow/_utils/logger_utils.py | 15 +++++++- .../promptflow/core/_serving/flow_invoker.py | 9 +++-- .../entities/_flows/_flow_context_resolver.py | 16 +++++---- .../promptflow/_sdk/entities/_flows/base.py | 6 +++- .../promptflow/evals/evaluate/_evaluate.py | 34 ++++++++++-------- .../evals/evaluators/chat/__init__.py | 8 ++--- .../evaluators/coherence/flow/parse_score.py | 4 ++- .../flow/evaluate_with_rai_service.py | 4 +-- .../evaluators/content_safety/flow/utils.py | 4 ++- .../content_safety/hate_unfairness.py | 7 ++-- .../evaluators/content_safety/self_harm.py | 7 ++-- .../evals/evaluators/content_safety/sexual.py | 7 ++-- .../evaluators/content_safety/violence.py | 7 ++-- .../evals/evaluators/f1_score/__init__.py | 9 +++-- .../evaluators/f1_score/flow/f1_score.py | 2 +- .../evals/evaluators/fluency/__init__.py | 9 +++-- .../evaluators/fluency/flow/parse_score.py | 4 ++- .../evals/evaluators/groundedness/__init__.py | 9 +++-- .../groundedness/flow/parse_score.py | 4 ++- .../evals/evaluators/qa/__init__.py | 19 ++++++---- .../evals/evaluators/relevance/__init__.py | 9 +++-- .../evaluators/relevance/flow/parse_score.py | 4 ++- .../evals/evaluators/similarity/__init__.py | 9 +++-- .../evaluators/similarity/flow/parse_score.py | 4 ++- .../promptflow/evals/synthetic/qa.py | 2 +- src/promptflow-evals/pyproject.toml | 9 +++++ .../tests/unittests/test_f1_evaluator.py | 36 +++++++++++++++++++ 27 files changed, 193 insertions(+), 64 deletions(-) create mode 100644 src/promptflow-evals/tests/unittests/test_f1_evaluator.py diff --git a/src/promptflow-core/promptflow/_utils/logger_utils.py b/src/promptflow-core/promptflow/_utils/logger_utils.py index fca4c9c0a7e..7eede471376 100644 --- a/src/promptflow-core/promptflow/_utils/logger_utils.py +++ b/src/promptflow-core/promptflow/_utils/logger_utils.py @@ -182,7 +182,7 @@ def get_pf_logging_level(default=logging.INFO): def get_logger(name: str) -> logging.Logger: """Get logger used during execution.""" - logger = logging.Logger(name) + logger = logging.getLogger(name) logger.setLevel(get_pf_logging_level()) logger.addHandler(FileHandlerConcurrentWrapper()) stdout_handler = logging.StreamHandler(sys.stdout) @@ -207,6 +207,19 @@ def get_logger(name: str) -> logging.Logger: service_logger = get_logger("execution.service") +def update_logger_levels(log_level: Optional[str] = None) -> None: + """ + Update the logger levels. + + :param log_level: The new logging level. If it is None, + logging level will be taken from + using get_pf_logging_level. + :type log_level: Optional[str] + """ + for log in [flow_logger, bulk_logger, logger, service_logger]: + log.setLevel(log_level or get_pf_logging_level()) + + logger_contexts = [] diff --git a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py index 09a0c6ce659..c7a85cee9b4 100644 --- a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py +++ b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import dataclasses +import logging import os from pathlib import Path from typing import Callable, Union @@ -59,7 +60,10 @@ def __init__( init_kwargs: dict = None, **kwargs, ): - self.logger = kwargs.get("logger", LoggerFactory.get_logger("flowinvoker")) + self.logger = kwargs.get( + "logger", + LoggerFactory.get_logger("flowinvoker", + verbosity=kwargs.get('log_level') or logging.INFO)) self._init_kwargs = init_kwargs or {} self.logger.debug(f"Init flow invoker with init kwargs: {self._init_kwargs}") # TODO: avoid to use private attribute after we finalize the inheritance @@ -123,7 +127,8 @@ def _init_connections(self, connection_provider): connection_names=self.flow.get_connection_names( environment_variables_overrides=os.environ, ), - provider=ConnectionProvider.init_from_provider_config(connection_provider, credential=self._credential), + provider=ConnectionProvider.init_from_provider_config( + connection_provider, credential=self._credential), connections_to_ignore=connections_to_ignore, # fetch connections with name override connections_to_add=list(self.connections_name_overrides.values()), diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py index 59e347fbb6a..7c0c4be3f66 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/_flow_context_resolver.py @@ -6,7 +6,7 @@ from functools import lru_cache from os import PathLike from pathlib import Path -from typing import Dict, Union +from typing import Dict, Optional, Union from promptflow._sdk._configuration import Configuration from promptflow._sdk._constants import NODES @@ -37,19 +37,20 @@ def __init__(self, flow_path: PathLike): @classmethod @lru_cache - def resolve(cls, flow: Flow) -> "FlowInvoker": + def resolve(cls, flow: Flow, log_level: Optional[int] = None) -> "FlowInvoker": """Resolve flow to flow invoker.""" resolver = cls(flow_path=flow.path) resolver._resolve(flow_context=flow.context) - return resolver._create_invoker(flow_context=flow.context) + return resolver._create_invoker(flow_context=flow.context, log_level=log_level) @classmethod @lru_cache - def resolve_async_invoker(cls, flow: Flow) -> "AsyncFlowInvoker": + def resolve_async_invoker(cls, flow: Flow, log_level: Optional[int] = None) -> "AsyncFlowInvoker": """Resolve flow to flow invoker.""" resolver = cls(flow_path=flow.path) resolver._resolve(flow_context=flow.context) - return resolver._create_invoker(flow_context=flow.context, is_async_call=True) + return resolver._create_invoker(flow_context=flow.context, is_async_call=True, + log_level=log_level) def _resolve(self, flow_context: FlowContext): """Resolve flow context.""" @@ -113,7 +114,8 @@ def _resolve_connection_objs(self, flow_context: FlowContext): return connections def _create_invoker( - self, flow_context: FlowContext, is_async_call=False + self, flow_context: FlowContext, is_async_call=False, + log_level: Optional[int] = None ) -> Union["FlowInvoker", "AsyncFlowInvoker"]: from promptflow.core._serving.flow_invoker import AsyncFlowInvoker, FlowInvoker @@ -132,6 +134,7 @@ def _create_invoker( flow=resolved_flow, connections=connections, streaming=flow_context.streaming, + log_level=log_level, ) else: return FlowInvoker( @@ -139,4 +142,5 @@ def _create_invoker( connections=connections, streaming=flow_context.streaming, connection_provider=Configuration.get_instance().get_connection_provider(), + log_level=log_level, ) diff --git a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py index 900b4ea7497..13214cfc3c7 100644 --- a/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py +++ b/src/promptflow-devkit/promptflow/_sdk/entities/_flows/base.py @@ -9,6 +9,7 @@ from promptflow._constants import DEFAULT_ENCODING, FLOW_FILE_SUFFIX from promptflow._sdk.entities._validation import SchemaValidatableMixin from promptflow._utils.flow_utils import is_flex_flow, is_prompty_flow, resolve_flow_path +from promptflow._utils.logger_utils import update_logger_levels from promptflow._utils.yaml_utils import load_yaml_string from promptflow.core._flow import AbstractFlowBase from promptflow.exceptions import UserErrorException @@ -145,6 +146,7 @@ def __init__( **kwargs, ): self.variant = kwargs.pop("variant", None) or {} + self._log_level = kwargs.pop("log_level", None) super().__init__(data=dag, code=code, path=path, **kwargs) @property @@ -236,6 +238,8 @@ def __call__(self, *args, **kwargs): if args: raise UserErrorException("Flow can only be called with keyword arguments.") + if self._log_level: + update_logger_levels(self._log_level) result = self.invoke(inputs=kwargs) return result.output @@ -243,7 +247,7 @@ def invoke(self, inputs: dict) -> "LineResult": """Invoke a flow and get a LineResult object.""" from promptflow._sdk.entities._flows._flow_context_resolver import FlowContextResolver - invoker = FlowContextResolver.resolve(flow=self) + invoker = FlowContextResolver.resolve(flow=self, log_level=self._log_level) result = invoker._invoke( data=inputs, ) diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py index ac3d53f1dd8..548028d7f98 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py @@ -3,7 +3,7 @@ # --------------------------------------------------------- import inspect from types import FunctionType -from typing import Callable, Dict, Optional +from typing import Callable, Dict, Optional, Union, cast import pandas as pd @@ -103,23 +103,27 @@ def evaluate( code_client = CodeClient() evaluator_info = {} - - for evaluator_name, evaluator in evaluators.items(): - if isinstance(evaluator, FunctionType): - evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}}) - else: - evaluator_info.update({evaluator_name: {"client": code_client, "evaluator": evaluator}}) - - evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run( - flow=evaluator, - column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)), - data=data, - stream=True, - ) + if evaluator_config is None: + evaluator_config = {} + + if evaluators: + for evaluator_name, evaluator in evaluators.items(): + if isinstance(evaluator, FunctionType): + evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}}) + else: + evaluator_info.update({evaluator_name: {"client": code_client, "evaluator": evaluator}}) + + evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run( + flow=evaluator, + column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)), + data=data, + stream=True, + ) evaluators_result_df = None for evaluator_name, evaluator_info in evaluator_info.items(): - evaluator_result_df = evaluator_info["client"].get_details(evaluator_info["run"], all_results=True) + evaluator_result_df = cast( + Union[PFClient, CodeClient], evaluator_info["client"]).get_details(evaluator_info["run"], all_results=True) # drop input columns evaluator_result_df = evaluator_result_df.drop( diff --git a/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py index 0c7b8ab2b72..de38dc6aa3b 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py @@ -6,7 +6,7 @@ from promptflow.connections import AzureOpenAIConnection from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator -from typing import List, Dict +from typing import Any, Dict, List from concurrent.futures import ThreadPoolExecutor, as_completed import json import logging @@ -170,8 +170,8 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator): return {} def _aggregate_results(self, per_turn_results: List[Dict]): - scores = {} - reasons = {} + scores: Dict[str, Any] = {} + reasons: Dict[str, Any] = {} for turn in per_turn_results: for metric, value in turn.items(): @@ -184,7 +184,7 @@ def _aggregate_results(self, per_turn_results: List[Dict]): scores[metric] = [] scores[metric].append(value) - aggregated = {} + aggregated: Dict[str, Any] = {} evaluation_per_turn = {} for metric, values in scores.items(): diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/coherence/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py index 0b93be5ce38..e2433e610b0 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/evaluate_with_rai_service.py @@ -1,7 +1,7 @@ from promptflow.core import tool import time from constants import RAIService, Tasks -from typing import List +from typing import Any, Dict, List import numpy as np import re from utils import get_harm_severity_level @@ -68,7 +68,7 @@ def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredentia time.sleep(sleep_time) -def parse_response(batch_response: List[dict], metric_name: str) -> List[List[dict]]: +def parse_response(batch_response: List[dict], metric_name: str) -> Dict[str, Any]: # Fix the metric name if it's "hate_fairness" # Eventually we will remove this fix once the RAI service is updated key = metric_name diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py index f9a035158bf..ff4fb69df44 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/flow/utils.py @@ -1,8 +1,10 @@ import constants import numpy as np +from typing import Union -def get_harm_severity_level(harm_score: int) -> str: + +def get_harm_severity_level(harm_score: int) -> Union[str, float]: HARM_SEVERITY_LEVEL_MAPPING = {constants.HarmSeverityLevel.Safe: [0, 1], constants.HarmSeverityLevel.Low: [2, 3], constants.HarmSeverityLevel.Medium: [4, 5], diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py index 6cb9467533f..e567cc0c408 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/hate_unfairness.py @@ -6,7 +6,8 @@ class HateUnfairnessEvaluator: - def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None): + def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None, + log_level: Optional[int] = None): """ Initialize an evaluator for hate unfairness score. @@ -15,6 +16,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -30,7 +33,7 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates hate unfairness content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py index bb01f05dd55..dd5ae0c6c33 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/self_harm.py @@ -6,7 +6,8 @@ class SelfHarmEvaluator: - def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None): + def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None, + log_level: Optional[int] = None): """ Initialize an evaluator for self harm score. @@ -15,6 +16,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -30,7 +33,7 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates self harm content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py index fb0d3f79d71..5d83826d7c2 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/sexual.py @@ -6,7 +6,8 @@ class SexualEvaluator: - def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None): + def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None, + log_level: Optional[int] = None): """ Initialize an evaluator for sexual score. @@ -15,6 +16,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -30,7 +33,7 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates sexual content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py index 47382d2c330..56a8724e582 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/content_safety/violence.py @@ -6,7 +6,8 @@ class ViolenceEvaluator: - def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None): + def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = None, + log_level: Optional[int] = None): """ Initialize an evaluator for violence score. @@ -15,6 +16,8 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = :type project_scope: dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -30,7 +33,7 @@ def __init__(self, project_scope: dict, credential: Optional[TokenCredential] = # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, question: str, answer: str, **kwargs): """Evaluates violence content. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py index 2372e98cc72..d2baaf281c9 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/__init__.py @@ -4,15 +4,20 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.client import load_flow from pathlib import Path class F1ScoreEvaluator: - def __init__(self): + def __init__(self, log_level: Optional[int] = None) -> None: """ Initialize an evaluator for calculating F1 score. + :param log_level: The logging level. + :type log_level: Optional[int] + **Usage** .. code-block:: python @@ -27,7 +32,7 @@ def __init__(self): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) def __call__(self, *, answer: str, ground_truth: str, **kwargs): """Evaluate F1 score. diff --git a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py index 08c6ad25677..5eb4ee5e87e 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/f1_score/flow/f1_score.py @@ -3,7 +3,7 @@ @tool -def compute_f1_score(answer: str, ground_truth: str) -> str: +def compute_f1_score(answer: str, ground_truth: str) -> float: import string import re diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py index 8aeab64e6b7..b293c8a5232 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/fluency/__init__.py @@ -4,13 +4,16 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.client import load_flow from promptflow.entities import AzureOpenAIConnection from pathlib import Path class FluencyEvaluator: - def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): + def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -18,6 +21,8 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): :type model_config: AzureOpenAIConnection :param deployment_name: Deployment to be used which has Azure OpenAI model. :type deployment_name: AzureOpenAIConnection + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -32,7 +37,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection self._flow.context.connections = { diff --git a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/fluency/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py index 72936f0a841..55542d69bbb 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/__init__.py @@ -4,13 +4,16 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.client import load_flow from promptflow.entities import AzureOpenAIConnection from pathlib import Path class GroundednessEvaluator: - def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): + def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -18,6 +21,8 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): :type model_config: AzureOpenAIConnection :param deployment_name: Deployment to be used which has Azure OpenAI model. :type deployment_name: AzureOpenAIConnection + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -33,7 +38,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection self._flow.context.connections = { diff --git a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/groundedness/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py index 832b58a389b..c8743b74f5f 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/qa/__init__.py @@ -4,13 +4,16 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.entities import AzureOpenAIConnection from promptflow.evals.evaluators import GroundednessEvaluator, RelevanceEvaluator, \ CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator class QAEvaluator: - def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): + def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -20,6 +23,8 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): :type deployment_name: AzureOpenAIConnection :return: A function that evaluates and generates metrics for "question-answering" scenario. :rtype: function + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -34,12 +39,12 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): ) """ self._evaluators = [ - GroundednessEvaluator(model_config, deployment_name=deployment_name), - RelevanceEvaluator(model_config, deployment_name=deployment_name), - CoherenceEvaluator(model_config, deployment_name=deployment_name), - FluencyEvaluator(model_config, deployment_name=deployment_name), - SimilarityEvaluator(model_config, deployment_name=deployment_name), - F1ScoreEvaluator(), + GroundednessEvaluator(model_config, deployment_name=deployment_name, log_level=log_level), + RelevanceEvaluator(model_config, deployment_name=deployment_name, log_level=log_level), + CoherenceEvaluator(model_config, deployment_name=deployment_name, log_level=log_level), + FluencyEvaluator(model_config, deployment_name=deployment_name, log_level=log_level), + SimilarityEvaluator(model_config, deployment_name=deployment_name, log_level=log_level), + F1ScoreEvaluator(log_level=log_level), ] def __call__(self, *, question: str, answer: str, context: str, ground_truth: str, **kwargs): diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py index cfa79d71b90..014d9578163 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/relevance/__init__.py @@ -4,13 +4,16 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.client import load_flow from promptflow.entities import AzureOpenAIConnection from pathlib import Path class RelevanceEvaluator: - def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): + def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -18,6 +21,8 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): :type model_config: AzureOpenAIConnection :param deployment_name: Deployment to be used which has Azure OpenAI model. :type deployment_name: AzureOpenAIConnection + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -34,7 +39,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection self._flow.context.connections = { diff --git a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/relevance/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py index 41d72ffdb60..bf441edd661 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/similarity/__init__.py @@ -4,13 +4,16 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from promptflow.client import load_flow from promptflow.entities import AzureOpenAIConnection from pathlib import Path class SimilarityEvaluator: - def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): + def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. @@ -18,6 +21,8 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): :type model_config: AzureOpenAIConnection :param deployment_name: Deployment to be used which has Azure OpenAI model. :type deployment_name: AzureOpenAIConnection + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -33,7 +38,7 @@ def __init__(self, model_config: AzureOpenAIConnection, deployment_name: str): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection self._flow.context.connections = { diff --git a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py index e4157b4d22c..ecc0624fe3a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/similarity/flow/parse_score.py @@ -2,9 +2,11 @@ import numpy as np import re +from typing import Optional + @tool -def parse_score(llm_output: str = None): +def parse_score(llm_output: Optional[str] = None): score = np.nan if llm_output: match = re.search(r'\d', llm_output) diff --git a/src/promptflow-evals/promptflow/evals/synthetic/qa.py b/src/promptflow-evals/promptflow/evals/synthetic/qa.py index e56a2150d80..0fb91afa915 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/qa.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/qa.py @@ -31,7 +31,7 @@ if openai_version >= pkg_resources.parse_version("1.0.0"): _RETRY_ERRORS: Tuple = (openai.APIConnectionError, openai.APIError, openai.APIStatusError) else: - _RETRY_ERRORS: Tuple = ( + _RETRY_ERRORS = ( openai.error.ServiceUnavailableError, # pylint: disable=no-member openai.error.APIError, # pylint: disable=no-member openai.error.RateLimitError, # pylint: disable=no-member diff --git a/src/promptflow-evals/pyproject.toml b/src/promptflow-evals/pyproject.toml index d482a648868..c7c0b98e638 100644 --- a/src/promptflow-evals/pyproject.toml +++ b/src/promptflow-evals/pyproject.toml @@ -95,3 +95,12 @@ name = "Contract forbidden modules" type = "forbidden" source_modules = ["promptflow.evals"] forbidden_modules = [] + +[tool.mypy] +exclude = [ + "tests/evals" +] +warn_unused_configs = true +follow_imports = "skip" +ignore_missing_imports = true +follow_imports_for_stubs = false \ No newline at end of file diff --git a/src/promptflow-evals/tests/unittests/test_f1_evaluator.py b/src/promptflow-evals/tests/unittests/test_f1_evaluator.py new file mode 100644 index 00000000000..ea4e9b01161 --- /dev/null +++ b/src/promptflow-evals/tests/unittests/test_f1_evaluator.py @@ -0,0 +1,36 @@ +''' +Created on Apr 12, 2024 + +@author: nirovins +''' +import pytest +import logging + +from unittest.mock import patch + +from promptflow.evals.evaluators.f1_score import F1ScoreEvaluator + + +class TestF1ScoreEvaluator: + + @pytest.mark.parametrize( + "log_level,expected", + [ + (logging.INFO, set(['flowinvoker', 'execution.flow'])), + (logging.WARNING, set()), + ]) + def test_f1_scre_evaluator_logs(self, caplog, log_level, expected): + """Test logging with f1 score_evaluator.""" + def mock_get(name: str, verbosity: int = logging.INFO, target_stdout: bool = False): + logger = logging.getLogger(name) + logger.setLevel(verbosity) + return logger + + with patch('promptflow._utils.logger_utils.LoggerFactory') as mock_factory: + mock_factory.get_logger = mock_get + F1ScoreEvaluator(log_level=log_level)( + answer='June is the coldest summer month.', + ground_truth='January is the coldest winter month.' + ) + log_called = {lg.name for lg in caplog.records} + assert {'flowinvoker', 'execution.flow'}.intersection(log_called) == expected From 6f1dd1349d372e04ac9321d9bd53d99a8850000d Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Mon, 15 Apr 2024 12:28:28 -0700 Subject: [PATCH 2/5] Fixes --- .../promptflow/core/_serving/flow_invoker.py | 5 ++--- src/promptflow-devkit/tests/sdk_cli_test/conftest.py | 2 +- ...test_f1_evaluator.py => test_evaluator_logging.py} | 11 +++++------ .../flows/flow_with_user_output/flow.dag.yaml | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) rename src/promptflow-evals/tests/unittests/{test_f1_evaluator.py => test_evaluator_logging.py} (76%) diff --git a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py index c7a85cee9b4..8a17426c164 100644 --- a/src/promptflow-core/promptflow/core/_serving/flow_invoker.py +++ b/src/promptflow-core/promptflow/core/_serving/flow_invoker.py @@ -2,14 +2,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- import dataclasses -import logging import os from pathlib import Path from typing import Callable, Union from promptflow._utils.dataclass_serializer import convert_eager_flow_output_to_dict from promptflow._utils.flow_utils import dump_flow_result, is_executable_chat_flow -from promptflow._utils.logger_utils import LoggerFactory +from promptflow._utils.logger_utils import LoggerFactory, get_pf_logging_level from promptflow._utils.multimedia_utils import MultimediaProcessor from promptflow.core._connection import _Connection from promptflow.core._connection_provider._connection_provider import ConnectionProvider @@ -63,7 +62,7 @@ def __init__( self.logger = kwargs.get( "logger", LoggerFactory.get_logger("flowinvoker", - verbosity=kwargs.get('log_level') or logging.INFO)) + verbosity=kwargs.get('log_level') or get_pf_logging_level())) self._init_kwargs = init_kwargs or {} self.logger.debug(f"Init flow invoker with init kwargs: {self._init_kwargs}") # TODO: avoid to use private attribute after we finalize the inheritance diff --git a/src/promptflow-devkit/tests/sdk_cli_test/conftest.py b/src/promptflow-devkit/tests/sdk_cli_test/conftest.py index 3c536329b93..e85a9f7f4f3 100644 --- a/src/promptflow-devkit/tests/sdk_cli_test/conftest.py +++ b/src/promptflow-devkit/tests/sdk_cli_test/conftest.py @@ -7,7 +7,7 @@ import pytest from _constants import CONNECTION_FILE, PROMPTFLOW_ROOT -from mock import mock +from unittest import mock from pytest_mock import MockerFixture from sqlalchemy import create_engine diff --git a/src/promptflow-evals/tests/unittests/test_f1_evaluator.py b/src/promptflow-evals/tests/unittests/test_evaluator_logging.py similarity index 76% rename from src/promptflow-evals/tests/unittests/test_f1_evaluator.py rename to src/promptflow-evals/tests/unittests/test_evaluator_logging.py index ea4e9b01161..68147dfff9d 100644 --- a/src/promptflow-evals/tests/unittests/test_f1_evaluator.py +++ b/src/promptflow-evals/tests/unittests/test_evaluator_logging.py @@ -1,8 +1,7 @@ -''' -Created on Apr 12, 2024 +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- -@author: nirovins -''' import pytest import logging @@ -11,7 +10,7 @@ from promptflow.evals.evaluators.f1_score import F1ScoreEvaluator -class TestF1ScoreEvaluator: +class TestEvaluatorLogging: @pytest.mark.parametrize( "log_level,expected", @@ -19,7 +18,7 @@ class TestF1ScoreEvaluator: (logging.INFO, set(['flowinvoker', 'execution.flow'])), (logging.WARNING, set()), ]) - def test_f1_scre_evaluator_logs(self, caplog, log_level, expected): + def test_f1_score_evaluator_logs(self, caplog, log_level, expected): """Test logging with f1 score_evaluator.""" def mock_get(name: str, verbosity: int = logging.INFO, target_stdout: bool = False): logger = logging.getLogger(name) diff --git a/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml b/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml index 8388a9d7a44..e52427ad235 100644 --- a/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml +++ b/src/promptflow/tests/test_configs/flows/flow_with_user_output/flow.dag.yaml @@ -4,7 +4,7 @@ inputs: outputs: output: type: string - reference: ${print_val.output.value} + reference: ${print_val.output} nodes: - name: print_val type: python From 9b00dc06f5d339e6201cea54ea32b95dbe8ee0e4 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Mon, 15 Apr 2024 13:44:25 -0700 Subject: [PATCH 3/5] Fix unit tests --- src/promptflow-core/promptflow/_utils/logger_utils.py | 2 +- .../tests/unittests/test_evaluator_logging.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/promptflow-core/promptflow/_utils/logger_utils.py b/src/promptflow-core/promptflow/_utils/logger_utils.py index 7eede471376..d8d755ae9f3 100644 --- a/src/promptflow-core/promptflow/_utils/logger_utils.py +++ b/src/promptflow-core/promptflow/_utils/logger_utils.py @@ -182,7 +182,7 @@ def get_pf_logging_level(default=logging.INFO): def get_logger(name: str) -> logging.Logger: """Get logger used during execution.""" - logger = logging.getLogger(name) + logger = logging.Logger(name) logger.setLevel(get_pf_logging_level()) logger.addHandler(FileHandlerConcurrentWrapper()) stdout_handler = logging.StreamHandler(sys.stdout) diff --git a/src/promptflow-evals/tests/unittests/test_evaluator_logging.py b/src/promptflow-evals/tests/unittests/test_evaluator_logging.py index 68147dfff9d..995278e22d8 100644 --- a/src/promptflow-evals/tests/unittests/test_evaluator_logging.py +++ b/src/promptflow-evals/tests/unittests/test_evaluator_logging.py @@ -15,11 +15,13 @@ class TestEvaluatorLogging: @pytest.mark.parametrize( "log_level,expected", [ - (logging.INFO, set(['flowinvoker', 'execution.flow'])), + (logging.INFO, set(['flowinvoker'])), (logging.WARNING, set()), ]) def test_f1_score_evaluator_logs(self, caplog, log_level, expected): """Test logging with f1 score_evaluator.""" + # Note we are not checking for 'execution.flow' as caplog + # cannot catch it as this logger does not have a root logger as a parent. def mock_get(name: str, verbosity: int = logging.INFO, target_stdout: bool = False): logger = logging.getLogger(name) logger.setLevel(verbosity) @@ -32,4 +34,4 @@ def mock_get(name: str, verbosity: int = logging.INFO, target_stdout: bool = Fal ground_truth='January is the coldest winter month.' ) log_called = {lg.name for lg in caplog.records} - assert {'flowinvoker', 'execution.flow'}.intersection(log_called) == expected + assert {'flowinvoker'}.intersection(log_called) == expected From c7f08f75f3385b39c4e185a4f6b279f9bfc32029 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Tue, 16 Apr 2024 09:34:02 -0700 Subject: [PATCH 4/5] Ignore pyproject files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 74df3834b44..641912b455b 100644 --- a/.gitignore +++ b/.gitignore @@ -191,3 +191,7 @@ config.json poetry.lock # promptflow subpackages __init__ src/promptflow-*/promptflow/__init__.py + +# Eclipse project files +**/.project +**/.pydevproject \ No newline at end of file From e04f9a077f41350950bbe8f3ecb842e272050190 Mon Sep 17 00:00:00 2001 From: nick863 <30440255+nick863@users.noreply.github.com> Date: Tue, 16 Apr 2024 10:02:59 -0700 Subject: [PATCH 5/5] Fix coherence evauator --- .../promptflow/evals/evaluators/coherence/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py b/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py index 2fb81de63b0..8818b7656a1 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/coherence/__init__.py @@ -4,6 +4,8 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore +from typing import Optional + from pathlib import Path from promptflow.client import load_flow @@ -12,12 +14,15 @@ class CoherenceEvaluator: - def __init__(self, model_config: AzureOpenAIModelConfiguration): + def __init__(self, model_config: AzureOpenAIModelConfiguration, + log_level: Optional[int] = None): """ Initialize an evaluator configured for a specific Azure OpenAI model. :param model_config: Configuration for the Azure OpenAI model. :type model_config: AzureOpenAIModelConfiguration + :param log_level: The logging level. + :type log_level: Optional[int] **Usage** @@ -32,7 +37,7 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): # Load the flow as function current_dir = Path(__file__).resolve().parent flow_dir = current_dir / "flow" - self._flow = load_flow(source=flow_dir) + self._flow = load_flow(source=flow_dir, log_level=log_level) # Override the connection connection = convert_model_configuration_to_connection(model_config)