Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Math Evaluators #3719

Merged
merged 9 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,12 @@
"rerank",
"reranker",
"rcfile",
"pylintrc"
"pylintrc",
"gleu",
"Gleu",
"GLEU",
"fmeasure",
"punkt"
],
"flagWords": [
"Prompt Flow"
Expand Down
28 changes: 28 additions & 0 deletions src/promptflow-evals/promptflow/evals/_common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,20 @@
from . import constants
except ImportError:
import constants

from typing import List, cast

import nltk
import numpy as np

try:
from nltk.tokenize.nist import NISTTokenizer
except LookupError:
nltk.download("perluniprops")
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize.nist import NISTTokenizer
ninghu marked this conversation as resolved.
Show resolved Hide resolved


def get_harm_severity_level(harm_score: int) -> str:
"""Generate harm severity level based on harm score.
Expand All @@ -28,3 +40,19 @@ def get_harm_severity_level(harm_score: int) -> str:
if harm_score_range[0] <= harm_score <= harm_score_range[1]:
return harm_level.value
return np.nan


def nltk_tokenize(text: str) -> List[str]:
    """Split ``text`` into tokens with an NLTK tokenizer picked by character set.

    If every character of ``text`` lies in the printable Basic Latin range
    (U+0020-U+007E) or the Latin-1 Supplement range (U+00A0-U+00FF), the text
    is tokenized with ``nltk.word_tokenize``. Otherwise the whole text is
    tokenized with ``NISTTokenizer().international_tokenize``.

    Control characters such as tab and newline sit below U+0020, so text that
    contains them takes the NIST path. An empty string takes the
    ``word_tokenize`` path.

    :param text: The text to tokenize.
    :type text: str
    :return: The tokens produced by the selected tokenizer.
    :rtype: List[str]
    """

    def _in_latin_ranges(char: str) -> bool:
        # ASCII digits (U+0030-U+0039) already fall inside the Basic Latin
        # range, so they need no separate comparison.
        return "\u0020" <= char <= "\u007E" or "\u00A0" <= char <= "\u00FF"

    if not all(map(_in_latin_ranges, text)):
        return list(NISTTokenizer().international_tokenize(text))

    return cast(List[str], nltk.word_tokenize(text))
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._bleu import BleuScoreEvaluator
from ._chat import ChatEvaluator
from ._coherence import CoherenceEvaluator
from ._content_safety import (
Expand All @@ -14,10 +15,13 @@
)
from ._f1_score import F1ScoreEvaluator
from ._fluency import FluencyEvaluator
from ._gleu import GleuScoreEvaluator
from ._groundedness import GroundednessEvaluator
from ._meteor import MeteorScoreEvaluator
from ._protected_material import ProtectedMaterialEvaluator
from ._qa import QAEvaluator
from ._relevance import RelevanceEvaluator
from ._rouge import RougeScoreEvaluator, RougeType
from ._similarity import SimilarityEvaluator

__all__ = [
Expand All @@ -35,5 +39,10 @@
"HateUnfairnessEvaluator",
"ContentSafetyEvaluator",
"ContentSafetyChatEvaluator",
"BleuScoreEvaluator",
"GleuScoreEvaluator",
"MeteorScoreEvaluator",
"RougeScoreEvaluator",
"RougeType",
"ProtectedMaterialEvaluator",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._bleu import BleuScoreEvaluator

# Names exported by a star-import of this package (its declared public API).
__all__ = [
"BleuScoreEvaluator",
]
72 changes: 72 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/_bleu/_bleu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.utils import nltk_tokenize


class _AsyncBleuScoreEvaluator:
    """Async callable that computes a sentence-level BLEU score with NLTK."""

    def __init__(self):
        """Create the evaluator; it takes no configuration."""

    async def __call__(self, *, answer: str, ground_truth: str, **kwargs):
        """Score ``answer`` against ``ground_truth``.

        Both strings are tokenized with ``nltk_tokenize``; the ground truth is
        used as the single reference and the answer as the hypothesis.
        Additional keyword arguments are accepted and ignored.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: A dictionary with the score under the ``bleu_score`` key.
        :rtype: dict
        """
        references = [nltk_tokenize(ground_truth)]
        candidate = nltk_tokenize(answer)

        return {
            "bleu_score": sentence_bleu(
                references,
                candidate,
                # Smoothing method 4 from NLTK's SmoothingFunction.
                smoothing_function=SmoothingFunction().method4,
            ),
        }


class BleuScoreEvaluator:
    """
    Compute the BLEU score of an answer against a ground-truth string.

    BLEU (Bilingual Evaluation Understudy) is a metric commonly used in natural language processing (NLP) and
    machine translation, and it is also widely applied to text summarization and text generation. It measures how
    closely the generated text matches the reference text. Scores range from 0 to 1; a higher score indicates
    better quality.

    **Usage**

    .. code-block:: python

        eval_fn = BleuScoreEvaluator()
        result = eval_fn(
            answer="Tokyo is the capital of Japan.",
            ground_truth="The capital of Japan is Tokyo.")

    **Output format**

    .. code-block:: python

        {
            "bleu_score": 0.22
        }
    """

    def __init__(self):
        # The synchronous entry point delegates all scoring to this async evaluator.
        self._async_evaluator = _AsyncBleuScoreEvaluator()

    def __call__(self, *, answer: str, ground_truth: str, **kwargs):
        """
        Compute the BLEU score of the answer with respect to the ground truth.

        The call is forwarded, together with any extra keyword arguments, to the wrapped async evaluator through
        ``async_run_allowing_running_loop``.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The BLEU score.
        :rtype: dict
        """
        call_inputs = {"answer": answer, "ground_truth": ground_truth, **kwargs}
        return async_run_allowing_running_loop(self._async_evaluator, **call_inputs)

    def _to_async(self):
        """Return the wrapped async evaluator instance."""
        return self._async_evaluator
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._gleu import GleuScoreEvaluator

# Names exported by a star-import of this package (its declared public API).
__all__ = [
"GleuScoreEvaluator",
]
71 changes: 71 additions & 0 deletions src/promptflow-evals/promptflow/evals/evaluators/_gleu/_gleu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.gleu_score import sentence_gleu

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.utils import nltk_tokenize


class _AsyncGleuScoreEvaluator:
    """Async callable that computes a sentence-level GLEU score with NLTK."""

    def __init__(self):
        """Create the evaluator; it takes no configuration."""

    async def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        """Score ``answer`` against ``ground_truth``.

        Both strings are tokenized with ``nltk_tokenize``; the ground truth is
        used as the single reference and the answer as the hypothesis.
        Additional keyword arguments are accepted and ignored.

        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :return: A dictionary with the score under the ``gleu_score`` key.
        :rtype: dict
        """
        references = [nltk_tokenize(ground_truth)]
        candidate = nltk_tokenize(answer)

        return {"gleu_score": sentence_gleu(references, candidate)}


class GleuScoreEvaluator:
    """
    Evaluator that computes the GLEU Score between two strings.

    The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
    evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
    sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
    use cases such as machine translation, text summarization, and text generation.

    **Usage**

    .. code-block:: python

        eval_fn = GleuScoreEvaluator()
        result = eval_fn(
            answer="Tokyo is the capital of Japan.",
            ground_truth="The capital of Japan is Tokyo.")

    **Output format**

    .. code-block:: python

        {
            "gleu_score": 0.41
        }
    """

    def __init__(self):
        # The synchronous entry point delegates all scoring to this async evaluator.
        self._async_evaluator = _AsyncGleuScoreEvaluator()

    def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        """
        Evaluate the GLEU score between the answer and the ground truth.

        The call is forwarded, together with any extra keyword arguments, to the wrapped async evaluator through
        ``async_run_allowing_running_loop``.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The GLEU score.
        :rtype: dict
        """
        return async_run_allowing_running_loop(
            self._async_evaluator, ground_truth=ground_truth, answer=answer, **kwargs
        )

    def _to_async(self):
        """Return the wrapped async evaluator instance."""
        return self._async_evaluator
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._meteor import MeteorScoreEvaluator

# Names exported by a star-import of this package (its declared public API).
__all__ = [
"MeteorScoreEvaluator",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.meteor_score import single_meteor_score

from promptflow._utils.async_utils import async_run_allowing_running_loop
from promptflow.evals._common.utils import nltk_tokenize


class _AsyncMeteorScoreEvaluator:
    """Async callable that computes a METEOR score with NLTK."""

    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
        """Store the METEOR parameters used for every evaluation.

        :param alpha: Value passed to ``single_meteor_score`` as ``alpha``. Default is 0.9.
        :type alpha: float
        :param beta: Value passed to ``single_meteor_score`` as ``beta``. Default is 3.0.
        :type beta: float
        :param gamma: Value passed to ``single_meteor_score`` as ``gamma``. Default is 0.5.
        :type gamma: float
        """
        self._alpha, self._beta, self._gamma = alpha, beta, gamma

    async def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        """Score ``answer`` against ``ground_truth``.

        Both strings are tokenized with ``nltk_tokenize``; the ground truth is
        the reference and the answer is the hypothesis. Additional keyword
        arguments are accepted and ignored.

        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :return: A dictionary with the score under the ``meteor_score`` key.
        :rtype: dict
        """
        reference = nltk_tokenize(ground_truth)
        hypothesis = nltk_tokenize(answer)
        weights = {"alpha": self._alpha, "beta": self._beta, "gamma": self._gamma}

        return {"meteor_score": single_meteor_score(reference, hypothesis, **weights)}


class MeteorScoreEvaluator:
    """
    Compute the METEOR score of an answer against a ground-truth string.

    The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text
    by comparing it to reference text, focusing on precision, recall, and content alignment. It addresses
    limitations of other metrics like BLEU by taking synonyms, stemming, and paraphrasing into account, which lets
    it capture meaning and language variations more accurately. In addition to machine translation and text
    summarization, paraphrase detection is an optimal use case for the METEOR score.

    :param alpha: The METEOR score alpha parameter. Default is 0.9.
    :type alpha: float
    :param beta: The METEOR score beta parameter. Default is 3.0.
    :type beta: float
    :param gamma: The METEOR score gamma parameter. Default is 0.5.
    :type gamma: float

    **Usage**

    .. code-block:: python

        eval_fn = MeteorScoreEvaluator(
            alpha=0.9,
            beta=3.0,
            gamma=0.5
        )
        result = eval_fn(
            answer="Tokyo is the capital of Japan.",
            ground_truth="The capital of Japan is Tokyo.")

    **Output format**

    .. code-block:: python

        {
            "meteor_score": 0.62
        }
    """

    def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
        # The parameters are held by the async evaluator, which performs the scoring.
        self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)

    def __call__(self, *, ground_truth: str, answer: str, **kwargs):
        """
        Compute the METEOR score of the answer with respect to the ground truth.

        The call is forwarded, together with any extra keyword arguments, to the wrapped async evaluator through
        ``async_run_allowing_running_loop``.

        :keyword answer: The answer to be evaluated.
        :paramtype answer: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The METEOR score.
        :rtype: dict
        """
        call_inputs = {"ground_truth": ground_truth, "answer": answer, **kwargs}
        return async_run_allowing_running_loop(self._async_evaluator, **call_inputs)

    def _to_async(self):
        """Return the wrapped async evaluator instance."""
        return self._async_evaluator
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._rouge import RougeScoreEvaluator, RougeType

# Names exported by a star-import of this package (its declared public API).
__all__ = [
"RougeScoreEvaluator",
"RougeType",
]
Loading
Loading