From 27cf8ee4fffb1c3c6bc1bd55e4df57df26fc623b Mon Sep 17 00:00:00 2001 From: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com> Date: Fri, 24 Nov 2023 11:07:02 +0100 Subject: [PATCH] Docs: Update Reader's doc strings (#6312) * Update doc strings * Add warm_up docs, fix margins --------- Co-authored-by: Vladimir Blagojevic --- .../preview/components/readers/extractive.py | 80 +++++++++++++------ 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/haystack/preview/components/readers/extractive.py b/haystack/preview/components/readers/extractive.py index 7fbe5d3ad1..5c763eef94 100644 --- a/haystack/preview/components/readers/extractive.py +++ b/haystack/preview/components/readers/extractive.py @@ -20,8 +20,10 @@ @component class ExtractiveReader: """ - A component for performing extractive QA. - Every possible answer span is assigned a confidence score independent of other answer spans. This fixes a common issue of other implementations which make comparisons across documents harder by normalising each document's answers independently. + A component that locates and extract answers to a given query from Documents. It's used for performing extractive + QA. The Reader assigns a probability score to every possible answer span independently of other answer spans. + This fixes a common issue of other implementations which make comparisons across documents harder by normalizing + each document's answers independently. Example usage: ```python @@ -51,29 +53,30 @@ def __init__( ) -> None: """ Creates an ExtractiveReader - :param model_name_or_path: A HuggingFace transformers question answering model. - Can either be a path to a folder containing the model files or an identifier for the HF hub + :param model_name_or_path: A Hugging Face transformers question answering model. + Can either be a path to a folder containing the model files or an identifier for the Hugging Face hub. Default: `'deepset/roberta-base-squad2-distilled'` - :param device: Pytorch device string. Uses GPU by default if available + :param device: Pytorch device string. Uses GPU by default, if available. :param token: The API token used to download private models from Hugging Face. If this parameter is set to `True`, then the token generated when running - `transformers-cli login` (stored in ~/.huggingface) will be used. + `transformers-cli login` (stored in ~/.huggingface) is used. :param top_k: Number of answers to return per query. It is required even if confidence_threshold is set. Defaults to 20. - An additional answer is returned if no_answer is set to True (default). - :param confidence_threshold: Answers with a confidence score below this value will not be returned + An additional answer with no text is returned if no_answer is set to True (default). + :param confidence_threshold: Returns only answers with the probability score above this threshold. :param max_seq_length: Maximum number of tokens. - If exceeded by a sequence, the sequence will be split. + If a sequence exceeds it, the sequence is split. Default: 384 - :param stride: Number of tokens that overlap when sequence is split because it exceeds max_seq_length + :param stride: Number of tokens that overlap when sequence is split because it exceeds max_seq_length. Default: 128 - :param max_batch_size: Maximum number of samples that are fed through the model at the same time + :param max_batch_size: Maximum number of samples that are fed through the model at the same time. :param answers_per_seq: Number of answer candidates to consider per sequence. - This is relevant when a document has been split into multiple sequence due to max_seq_length. - :param no_answer: Whether to return no answer scores - :param calibration_factor: Factor used for calibrating confidence scores + This is relevant when a Document was split into multiple sequences because of max_seq_length. + :param no_answer: Whether to return no answer scores. + :param calibration_factor: Factor used for calibrating probability scores. :param model_kwargs: Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained` - when loading the model specified in `model_name_or_path`. + when loading the model specified in `model_name_or_path`. For details on what kwargs you can pass, + see the model's documentation. """ torch_and_transformers_import.check() self.model_name_or_path = str(model_name_or_path) @@ -117,6 +120,9 @@ def to_dict(self) -> Dict[str, Any]: ) def warm_up(self): + """ + Loads model and tokenizer + """ if self.model is None: if torch.cuda.is_available(): self.device = self.device or "cuda:0" @@ -138,7 +144,7 @@ def _flatten_documents( self, queries: List[str], documents: List[List[Document]] ) -> Tuple[List[str], List[Document], List[int]]: """ - Flattens queries and documents so all query-document pairs are arranged along one batch axis. + Flattens queries and Documents so all query-document pairs are arranged along one batch axis. """ flattened_queries = [query for documents_, query in zip(documents, queries) for _ in documents_] flattened_documents = [document for documents_ in documents for document in documents_] @@ -149,14 +155,15 @@ def _preprocess( self, queries: List[str], documents: List[Document], max_seq_length: int, query_ids: List[int], stride: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[Encoding], List[int], List[int]]: """ - Split and tokenise documents and preserve structures by returning mappings to query and document ids. + Split and tokenize Documents and preserve structures by returning mappings to query and Document IDs. """ texts = [] document_ids = [] for i, doc in enumerate(documents): if doc.content is None: warnings.warn( - f"Document with id {doc.id} was passed to ExtractiveReader, but does not contain any text. It will be ignored." + f"Document with id {doc.id} was passed to ExtractiveReader. The Document doesn't " + f"contain any text and it will be ignored." ) continue texts.append(doc.content) @@ -195,7 +202,9 @@ def _postprocess( encodings: List[Encoding], ) -> Tuple[List[List[int]], List[List[int]], torch.Tensor]: """ - Turn start and end logits into confidence scores for each answer span. Unlike most other implementations, there is no normalisation in each split to make the scores more comparable across different splits. The top k answer spans are returned. + Turn start and end logits into probability scores for each answer span. Unlike most other + implementations, it doesn't normalize the scores to make them easier to compare across different + splits. Returns the top k answer spans. """ mask = sequence_ids == 1 mask = torch.logical_and(mask, attention_mask == 1) @@ -269,7 +278,10 @@ def _nest_answers( no_answer: bool, ) -> List[List[ExtractedAnswer]]: """ - Reconstructs the nested structure that existed before flattening. Also computes a no answer probability. This probability is different from most other implementations because it does not consider the no answer logit introduced with SQuAD 2. Instead, it just computes the probability that the answer does not exist in the top k or top p. + Reconstructs the nested structure that existed before flattening. Also computes a no answer probability. + This probability is different from most other implementations because it does not consider the no answer + logit introduced with SQuAD 2. Instead, it just computes the probability that the answer does not exist + in the top k or top p. """ flat_answers_without_queries = [] for document_id, start_candidates_, end_candidates_, probabilities_ in zip( @@ -277,7 +289,18 @@ def _nest_answers( ): for start_, end_, probability in zip(start_candidates_, end_candidates_, probabilities_): doc = flattened_documents[document_id] - flat_answers_without_queries.append({"data": doc.content[start_:end_], "document": doc, "probability": probability.item(), "start": start_, "end": end_, "metadata": {}}) # type: ignore # doc.content cannot be None, because those documents are filtered when preprocessing. However, mypy doesn't know that. + # doc.content cannot be None, because those documents are filtered when preprocessing. + # However, mypy doesn't know that. + flat_answers_without_queries.append( + { + "data": doc.content[start_:end_], # type: ignore + "document": doc, + "probability": probability.item(), + "start": start_, + "end": end_, + "metadata": {}, + } + ) i = 0 nested_answers = [] for query_id in range(query_ids[-1] + 1): @@ -316,13 +339,24 @@ def run( no_answer: Optional[bool] = None, ): """ - Performs extractive QA on the given documents using the given query. + Locates and extracts answers from the given Documents using the given query. :param query: Query string. - :param documents: List of Documents to search for an answer to the query. + :param documents: List of Documents in which you want to search for an answer to the query. :param top_k: The maximum number of answers to return. An additional answer is returned if no_answer is set to True (default). + :param confidence_threshold: :return: List of ExtractedAnswers sorted by (desc.) answer score. + :param confidence_threshold: Returns only answers with the probability score above this threshold. + :param max_seq_length: Maximum number of tokens. + If a sequence exceeds it, the sequence is split. + Default: 384 + :param stride: Number of tokens that overlap when sequence is split because it exceeds max_seq_length. + Default: 128 + :param max_batch_size: Maximum number of samples that are fed through the model at the same time. + :param answers_per_seq: Number of answer candidates to consider per sequence. + This is relevant when a Document was split into multiple sequences because of max_seq_length. + :param no_answer: Whether to return no answer scores. """ queries = [query] # Temporary solution until we have decided what batching should look like in v2 nested_documents = [documents]