From f2b5f123b32a52321560c4ac476277e4d851b845 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci
Date: Fri, 13 Dec 2024 09:50:23 +0100
Subject: [PATCH 1/4] del HF token in tests (#8634)

---
 .../classifiers/test_zero_shot_document_classifier.py          | 2 ++
 test/components/generators/chat/test_hugging_face_local.py     | 1 +
 .../components/generators/test_hugging_face_local_generator.py | 1 +
 .../components/rankers/test_sentence_transformers_diversity.py | 2 +-
 test/components/rankers/test_transformers_similarity.py        | 1 +
 test/components/readers/test_extractive.py                     | 3 +++
 test/components/routers/test_transformers_text_router.py       | 3 +++
 test/components/routers/test_zero_shot_text_router.py          | 2 ++
 8 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/test/components/classifiers/test_zero_shot_document_classifier.py b/test/components/classifiers/test_zero_shot_document_classifier.py
index 7d679e3d21..be4d04a9fe 100644
--- a/test/components/classifiers/test_zero_shot_document_classifier.py
+++ b/test/components/classifiers/test_zero_shot_document_classifier.py
@@ -45,6 +45,7 @@ def test_to_dict(self):
 
     def test_from_dict(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.classifiers.zero_shot_document_classifier.TransformersZeroShotDocumentClassifier",
             "init_parameters": {
@@ -73,6 +74,7 @@ def test_from_dict(self, monkeypatch):
 
     def test_from_dict_no_default_parameters(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.classifiers.zero_shot_document_classifier.TransformersZeroShotDocumentClassifier",
             "init_parameters": {"model": "cross-encoder/nli-deberta-v3-xsmall", "labels": ["positive", "negative"]},
diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py
index 433917ec23..8f6749c2d8 100644
--- a/test/components/generators/chat/test_hugging_face_local.py
+++ b/test/components/generators/chat/test_hugging_face_local.py
@@ -166,6 +166,7 @@ def test_from_dict(self, model_info_mock):
     @patch("haystack.components.generators.chat.hugging_face_local.pipeline")
     def test_warm_up(self, pipeline_mock, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         generator = HuggingFaceLocalChatGenerator(
             model="mistralai/Mistral-7B-Instruct-v0.2",
             task="text2text-generation",
diff --git a/test/components/generators/test_hugging_face_local_generator.py b/test/components/generators/test_hugging_face_local_generator.py
index 5c3b162a31..bded2e8d47 100644
--- a/test/components/generators/test_hugging_face_local_generator.py
+++ b/test/components/generators/test_hugging_face_local_generator.py
@@ -18,6 +18,7 @@ class TestHuggingFaceLocalGenerator:
     @patch("haystack.utils.hf.model_info")
     def test_init_default(self, model_info_mock, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         model_info_mock.return_value.pipeline_tag = "text2text-generation"
 
         generator = HuggingFaceLocalGenerator()
diff --git a/test/components/rankers/test_sentence_transformers_diversity.py b/test/components/rankers/test_sentence_transformers_diversity.py
index eabd2ac375..018b443987 100644
--- a/test/components/rankers/test_sentence_transformers_diversity.py
+++ b/test/components/rankers/test_sentence_transformers_diversity.py
@@ -273,7 +273,7 @@ def test_warm_up(self, similarity, monkeypatch):
         Test that ranker loads the SentenceTransformer model correctly during warm up.
         """
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
-
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         mock_model_class = MagicMock()
         mock_model_instance = MagicMock()
         mock_model_class.return_value = mock_model_instance
diff --git a/test/components/rankers/test_transformers_similarity.py b/test/components/rankers/test_transformers_similarity.py
index 6031d85e15..616bfa6647 100644
--- a/test/components/rankers/test_transformers_similarity.py
+++ b/test/components/rankers/test_transformers_similarity.py
@@ -313,6 +313,7 @@ def test_device_map_and_device_raises(self, caplog):
     @patch("haystack.components.rankers.transformers_similarity.AutoModelForSequenceClassification.from_pretrained")
     def test_device_map_dict(self, mocked_automodel, _mocked_autotokenizer, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         ranker = TransformersSimilarityRanker("model", model_kwargs={"device_map": {"layer_1": 1, "classifier": "cpu"}})
 
         class MockedModel:
diff --git a/test/components/readers/test_extractive.py b/test/components/readers/test_extractive.py
index aedfaa13bc..a2f658b79b 100644
--- a/test/components/readers/test_extractive.py
+++ b/test/components/readers/test_extractive.py
@@ -519,6 +519,7 @@ def __init__(self):
 @patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
 def test_device_map_auto(mocked_automodel, _mocked_autotokenizer, monkeypatch):
     monkeypatch.delenv("HF_API_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
     reader = ExtractiveReader("deepset/roberta-base-squad2", model_kwargs={"device_map": "auto"})
 
     auto_device = ComponentDevice.resolve_device(None)
@@ -537,6 +538,7 @@ def __init__(self):
 @patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
 def test_device_map_str(mocked_automodel, _mocked_autotokenizer, monkeypatch):
     monkeypatch.delenv("HF_API_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
     reader = ExtractiveReader("deepset/roberta-base-squad2", model_kwargs={"device_map": "cpu:0"})
 
     class MockedModel:
@@ -554,6 +556,7 @@ def __init__(self):
 @patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
 def test_device_map_dict(mocked_automodel, _mocked_autotokenizer, monkeypatch):
     monkeypatch.delenv("HF_API_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
     reader = ExtractiveReader(
         "deepset/roberta-base-squad2", model_kwargs={"device_map": {"layer_1": 1, "classifier": "cpu"}}
     )
diff --git a/test/components/routers/test_transformers_text_router.py b/test/components/routers/test_transformers_text_router.py
index 8a0dca8d63..67ec163524 100644
--- a/test/components/routers/test_transformers_text_router.py
+++ b/test/components/routers/test_transformers_text_router.py
@@ -54,6 +54,7 @@ def test_to_dict_with_cpu_device(self, mock_auto_config_from_pretrained):
     def test_from_dict(self, mock_auto_config_from_pretrained, monkeypatch):
         mock_auto_config_from_pretrained.return_value = MagicMock(label2id={"en": 0, "de": 1})
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.transformers_text_router.TransformersTextRouter",
             "init_parameters": {
@@ -84,6 +85,7 @@ def test_from_dict_no_default_parameters(self, mock_auto_config_from_pretrained, monkeypatch):
         mock_auto_config_from_pretrained.return_value = MagicMock(label2id={"en": 0, "de": 1})
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.transformers_text_router.TransformersTextRouter",
             "init_parameters": {"model": "papluca/xlm-roberta-base-language-detection"},
@@ -105,6 +107,7 @@ def test_from_dict_no_default_parameters(self, mock_auto_config_from_pretrained,
     def test_from_dict_with_cpu_device(self, mock_auto_config_from_pretrained, monkeypatch):
         mock_auto_config_from_pretrained.return_value = MagicMock(label2id={"en": 0, "de": 1})
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.transformers_text_router.TransformersTextRouter",
             "init_parameters": {
diff --git a/test/components/routers/test_zero_shot_text_router.py b/test/components/routers/test_zero_shot_text_router.py
index 8e9759f361..3b931c39bb 100644
--- a/test/components/routers/test_zero_shot_text_router.py
+++ b/test/components/routers/test_zero_shot_text_router.py
@@ -28,6 +28,7 @@ def test_to_dict(self):
 
     def test_from_dict(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.zero_shot_text_router.TransformersZeroShotTextRouter",
             "init_parameters": {
@@ -56,6 +57,7 @@ def test_from_dict(self, monkeypatch):
 
     def test_from_dict_no_default_parameters(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.zero_shot_text_router.TransformersZeroShotTextRouter",
             "init_parameters": {"labels": ["query", "passage"]},
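
The repeated one-line change above has a single rationale: Haystack's Hugging Face components resolve their token from the environment, falling back across both the legacy HF_API_TOKEN variable and the newer HF_TOKEN, so serialization and warm-up tests only see a clean default when both are unset. A minimal, self-contained sketch of the pytest idiom (the test name and assertions are illustrative, not taken from the patch):

    import os


    def test_runs_without_ambient_hf_tokens(monkeypatch):
        # pytest's built-in monkeypatch fixture unsets each variable for
        # this test only and restores it on teardown. raising=False makes
        # delenv a no-op when the variable is absent, so the test passes
        # whether or not a token is exported in the developer's shell.
        monkeypatch.delenv("HF_API_TOKEN", raising=False)
        monkeypatch.delenv("HF_TOKEN", raising=False)

        assert "HF_API_TOKEN" not in os.environ
        assert "HF_TOKEN" not in os.environ
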
Batista" Date: Fri, 13 Dec 2024 12:12:40 +0100 Subject: [PATCH 2/4] initial import (#8635) --- e2e/pipelines/test_dense_doc_search.py | 2 +- e2e/pipelines/test_preprocessing_pipeline.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/e2e/pipelines/test_dense_doc_search.py b/e2e/pipelines/test_dense_doc_search.py index 39a587a106..f348b6f0e5 100644 --- a/e2e/pipelines/test_dense_doc_search.py +++ b/e2e/pipelines/test_dense_doc_search.py @@ -26,7 +26,7 @@ def test_dense_doc_search_pipeline(tmp_path, samples_path): indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner") indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner") indexing_pipeline.add_component( - instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter" + instance=DocumentSplitter(split_by="period", split_length=250, split_overlap=30), name="splitter" ) indexing_pipeline.add_component( instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder" diff --git a/e2e/pipelines/test_preprocessing_pipeline.py b/e2e/pipelines/test_preprocessing_pipeline.py index 82375f89d8..8894113913 100644 --- a/e2e/pipelines/test_preprocessing_pipeline.py +++ b/e2e/pipelines/test_preprocessing_pipeline.py @@ -25,9 +25,7 @@ def test_preprocessing_pipeline(tmp_path): instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router" ) preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner") - preprocessing_pipeline.add_component( - instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter" - ) + preprocessing_pipeline.add_component(instance=DocumentSplitter(split_by="period", split_length=1), name="splitter") preprocessing_pipeline.add_component( instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder" ) From db89b9a2e59da5a0fe59135ef4ab1f6252e2a7db Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 13 Dec 2024 12:35:58 +0100 Subject: [PATCH 3/4] fix: removing unused import (#8636) --- e2e/pipelines/test_preprocessing_pipeline.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/e2e/pipelines/test_preprocessing_pipeline.py b/e2e/pipelines/test_preprocessing_pipeline.py index 8894113913..4667454276 100644 --- a/e2e/pipelines/test_preprocessing_pipeline.py +++ b/e2e/pipelines/test_preprocessing_pipeline.py @@ -2,8 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import json - from haystack import Pipeline from haystack.components.classifiers import DocumentLanguageClassifier from haystack.components.converters import TextFileToDocument From a5b57f4b1fd4ef4227d7d54170f99b142836a04c Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 16 Dec 2024 13:57:41 +0100 Subject: [PATCH 4/4] adding SentenceSplitter to init imports (#8644) --- haystack/components/preprocessors/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/__init__.py b/haystack/components/preprocessors/__init__.py index f7e132077a..467f16ceeb 100644 --- a/haystack/components/preprocessors/__init__.py +++ b/haystack/components/preprocessors/__init__.py @@ -5,6 +5,7 @@ from .document_cleaner import DocumentCleaner from .document_splitter import DocumentSplitter from .nltk_document_splitter import NLTKDocumentSplitter +from .sentence_tokenizer import SentenceSplitter from .text_cleaner import TextCleaner -__all__ = ["DocumentSplitter", "DocumentCleaner", "TextCleaner", "NLTKDocumentSplitter"] +__all__ = ["DocumentSplitter", "DocumentCleaner", "NLTKDocumentSplitter", "SentenceSplitter", "TextCleaner"]