diff --git a/requirements.dev.txt b/requirements.dev.txt
index 91de2350..457696b5 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -25,7 +25,7 @@ nbsphinx==0.8.8
 sphinx_material==0.0.35
 pytest>=6.2.5
 pytest-cov>=2.8.1
-scikit-learn>=1.0.1,<1.4
+scikit-learn>=1.4.0
 xgboost>=1.0.0
 nbformat>4.2.0
 numba>=0.53.1
diff --git a/setup.py b/setup.py
index cfbd6f51..3dfc9325 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
     "dash-table>=5.0.0",
     "nbformat>4.2.0",
     "numba>=0.53.1",
-    "scikit-learn>=1.0.1,<1.4",
+    "scikit-learn>=1.4.0",
     "category_encoders>=2.6.0",
     "scipy>=0.19.1",
 ]
diff --git a/shapash/utils/columntransformer_backend.py b/shapash/utils/columntransformer_backend.py
index 52fe4b6b..09a0ab09 100644
--- a/shapash/utils/columntransformer_backend.py
+++ b/shapash/utils/columntransformer_backend.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import pandas as pd
+from sklearn.preprocessing import FunctionTransformer
 
 from shapash.utils.category_encoder_backend import (
     category_encoder_binary,
@@ -91,7 +92,7 @@ def inv_transform_ct(x_in, encoding):
 
             # columns not encode
             elif name_encoding == "remainder":
-                if ct_encoding == "passthrough":
+                if isinstance(ct_encoding, FunctionTransformer):
                     nb_col = len(col_encoding)
                     frame = x_in.iloc[:, init : init + nb_col]
                 else:
@@ -249,7 +250,7 @@
                 init += nb_col
 
             elif name_encoding == "remainder":
-                if ct_encoding == "passthrough":
+                if isinstance(ct_encoding, FunctionTransformer):
                     nb_col = len(col_encoding)
                     frame = x_contrib.iloc[:, init : init + nb_col]
                     rst = pd.concat([rst, frame], axis=1)
@@ -366,7 +367,9 @@
         List of returned features names when ColumnTransformer is applied.
     """
     feature_names = []
-    l_transformers = list(column_transformer._iter(fitted=True))
+    l_transformers = list(
+        column_transformer._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True)
+    )
 
     for name, trans, column, _ in l_transformers:
         feature_names.extend(get_names(name, trans, column, column_transformer))
@@ -463,11 +466,8 @@
             else:
                 raise NotImplementedError(f"Estimator not supported : {estimator}")
 
-        elif estimator == "passthrough":
-            try:
-                features_out = encoder.feature_names_in_[features]
-            except Exception:
-                features_out = encoder._feature_names_in[features]  # for oldest sklearn version
+        elif isinstance(estimator, FunctionTransformer):
+            features_out = encoder.feature_names_in_[features]
             for f_name in features_out:
                 dict_col_mapping[f_name] = [x_encoded.columns.to_list()[idx_encoded]]
                 idx_encoded += 1
diff --git a/shapash/utils/transform.py b/shapash/utils/transform.py
index 42ddf779..37f56655 100644
--- a/shapash/utils/transform.py
+++ b/shapash/utils/transform.py
@@ -1,10 +1,12 @@
 """
 Transform Module
 """
+
 import re
 
 import numpy as np
 import pandas as pd
+from sklearn.preprocessing import FunctionTransformer
 
 from shapash.utils.category_encoder_backend import (
     get_col_mapping_ce,
@@ -185,7 +187,7 @@
                 if (str(type(ct_encoding)) not in supported_sklearn) and (
                     str(type(ct_encoding)) not in supported_category_encoder
                 ):
-                    if str(type(ct_encoding)) != "<class 'str'>":
+                    if not isinstance(ct_encoding, str) and not isinstance(ct_encoding, FunctionTransformer):
                         raise ValueError("One of the encoders used in ColumnTransformers isn't supported.")
 
         elif str(type(enc)) in supported_category_encoder:
diff --git a/tests/unit_tests/utils/test_columntransformer_backend.py b/tests/unit_tests/utils/test_columntransformer_backend.py
index 15541d6e..78a44d5f 100644
--- a/tests/unit_tests/utils/test_columntransformer_backend.py
+++ b/tests/unit_tests/utils/test_columntransformer_backend.py
@@ -1,6 +1,7 @@
 """
 Unit test of Inverse Transform
 """
+
 import unittest
 
 import catboost as cb
@@ -959,25 +960,25 @@ def test_get_names_1(self):
         enc_4.fit(train)
 
         feature_names_1 = []
-        l_transformers = list(enc_1._iter(fitted=True))
+        l_transformers = list(enc_1._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True))
         for name, trans, column, _ in l_transformers:
             feature_names_1.extend(get_names(name, trans, column, enc_1))
 
         feature_names_2 = []
-        l_transformers = list(enc_2._iter(fitted=True))
+        l_transformers = list(enc_2._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True))
         for name, trans, column, _ in l_transformers:
             feature_names_2.extend(get_names(name, trans, column, enc_2))
 
         feature_names_3 = []
-        l_transformers = list(enc_3._iter(fitted=True))
+        l_transformers = list(enc_3._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True))
         for name, trans, column, _ in l_transformers:
             feature_names_3.extend(get_names(name, trans, column, enc_3))
 
         feature_names_4 = []
-        l_transformers = list(enc_4._iter(fitted=True))
+        l_transformers = list(enc_4._iter(fitted=True, column_as_labels=False, skip_drop=True, skip_empty_columns=True))
         for name, trans, column, _ in l_transformers:
             feature_names_4.extend(get_names(name, trans, column, enc_4))
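
Illustration (not part of the patch): the isinstance(..., FunctionTransformer) checks above suggest that, from scikit-learn 1.4 on, a fitted ColumnTransformer stores the remainder="passthrough" transformer as a FunctionTransformer instance in transformers_ rather than as the literal string "passthrough"; the extra keyword arguments passed to the private ColumnTransformer._iter helper likewise match the keyword-only signature that release expects. Below is a minimal sketch of the behaviour the new checks rely on; the DataFrame, column names and transformer name are made up for the example:

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder

    # Toy frame: one explicitly encoded column, one column left to the remainder.
    df = pd.DataFrame({"city": ["PARIS", "LYON"], "age": [25, 32]})
    ct = ColumnTransformer([("ord", OrdinalEncoder(), ["city"])], remainder="passthrough")
    ct.fit(df)

    for name, trans, cols in ct.transformers_:
        if name == "remainder":
            # scikit-learn >= 1.4 stores a FunctionTransformer here; older releases
            # kept the literal string "passthrough", which the removed checks tested for.
            print(type(trans).__name__, isinstance(trans, FunctionTransformer) or trans == "passthrough")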