Skip to content

Commit

Permalink
Added tests for feature selection component (#149)
Browse files Browse the repository at this point in the history
* Initial commit with working tests

* cleaned and ensured the file is working with pre-commit

* Testing checkpoint

* Added support for module_file

* Improved documentation

* Fixed test

* Fixed pre-commit errors

* Added data files for component_test.py

* Added tests for artifact count by type

* Fixed minor bug

* Added test to check if correct features are being selected

* Update dependencies for feature_selection

* Update tfx_addons/version.py

* Update tfx_addons/version.py

* Update tfx_addons/version.py

Co-authored-by: Gerard Casas Saez <gcasassaez@twitter.com>
  • Loading branch information
deutranium and casassg committed Aug 22, 2022
1 parent 2fe23e0 commit d4628c4
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 36 deletions.
40 changes: 33 additions & 7 deletions tfx_addons/feature_selection/component.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -93,20 +93,46 @@ def _get_file_list(dir_path):

@component
def FeatureSelection( # pylint: disable=C0103
module_file: Parameter[str], orig_examples: InputArtifact[Examples],
orig_examples: InputArtifact[Examples],
feature_selection: OutputArtifact[FeatureSelectionArtifact],
updated_data: OutputArtifact[Examples]):
"""Feature Selection component
Args (from the module file):
updated_data: OutputArtifact[Examples],
module_file: Parameter[str] = None,
module_path: Parameter[str] = None,
):
"""Runs a user-specified feature selection algorithm on an `Examples` artifact
Args:
- orig_examples: An `Examples` input artifact with the data to be
processed
- module_file: Python module file containing the configuration
Example: `modules_files.module_file_a`
Exactly one of `module_file` and `module_path` should be passed.
If both are used, module_file would be preferred
- module_path: Python module path containing the configuration
Example: `absolute_path/module_files/module_file_a.py` or
`./module_files/module_file_a.py`
Exactly one of `module_file` and `module_path` should be passed.
If both are used, module_file would be preferred
Module file configuration:
- SELECTOR_PARAMS: Parameters for SelectorFunc in the form of
a kwargs dictionary
Example: {"score_func": chi2, "k": 2}
Here, `chi2` has been imported from sklearn.feature_selection
- TARGET_FEATURE: Name of the feature containing target data
- SelectorFunc: Selector function for univariate feature selection
example: SelectKBest, SelectPercentile from sklearn.feature_selection
Example: SelectKBest, SelectPercentile from sklearn.feature_selection
"""

# importing the required functions and variables from the module file
modules = importlib.import_module(module_file)

if module_file:
modules = importlib.import_module(module_file)
elif module_path:
module_spec = importlib.util.spec_from_file_location(
"all_modules", module_path)
modules = importlib.util.module_from_spec(module_spec)
module_spec.loader.exec_module(modules)

mod_names = ["SELECTOR_PARAMS", "TARGET_FEATURE", "SelectorFunc"]
selector_params, target_feature, selector_func = [
getattr(modules, i) for i in mod_names
Expand Down
169 changes: 143 additions & 26 deletions tfx_addons/feature_selection/component_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -12,38 +12,155 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for HelloComponent."""
"""Tests for tfx_addons.feature_selection.component"""

import json
import csv
import importlib
import os
from typing import List, Optional, Text

import tensorflow as tf
from tfx.examples.custom_components.hello_world.hello_component import \
component
from tfx.types import artifact, channel_utils, standard_artifacts
import tfx
from tfx.orchestration import metadata

from tfx_addons.feature_selection import component

class HelloComponentTest(tf.test.TestCase):

def _get_selected_features(module_file, data_path):
  """Compute the reference set of selected feature names for the tests.

  The selector configuration is read from an importable module and applied
  directly to the rows of a CSV file, independently of the TFX component, so
  that the component's output can be compared against it.

  Args:
    module_file: Dotted import path of a module defining `SELECTOR_PARAMS`
      (kwargs dict for the selector), `TARGET_FEATURE` (name of the target
      column) and `SelectorFunc` (callable building the selector).
    data_path: Path to a comma-separated file whose first row is the header.

  Returns:
    A set with the header names of the columns chosen by the selector.

  Raises:
    AttributeError: If the module lacks one of the three configuration names.
    ValueError: If `TARGET_FEATURE` is not present in the CSV header.
    IndexError: If the CSV file is empty.
  """
  # importing required configurations
  config = importlib.import_module(module_file)
  selector_params = config.SELECTOR_PARAMS
  target_feature = config.TARGET_FEATURE
  selector_func = config.SelectorFunc

  # getting the data; the csv module requires files opened with newline=''
  # so that line endings embedded in quoted fields are parsed correctly
  with open(data_path, 'r', newline='') as csv_file:
    rows = list(csv.reader(csv_file, delimiter=','))

  header, records = rows[0], rows[1:]

  # splitting X (input) and Y (output) from CSV data without mutating rows;
  # the target column is removed from the header as well so that selector
  # column indices line up with `feature_names`
  target_idx = header.index(target_feature)
  feature_names = header[:target_idx] + header[target_idx + 1:]
  inputs = [row[:target_idx] + row[target_idx + 1:] for row in records]
  targets = [row[target_idx] for row in records]

  # running the selector function for feature selection
  selector = selector_func(**selector_params)
  selector.fit_transform(inputs, targets)

  # getting selected feature names
  selected_indices = selector.get_support(indices=True)
  return {feature_names[idx] for idx in selected_indices}


def _create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_root: Text,
    module_path: Text,
    metadata_path: Text,
    beam_pipeline_args: Optional[List[Text]] = None) -> tfx.v1.dsl.Pipeline:
  """Build the two-step test pipeline: CsvExampleGen -> FeatureSelection.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root directory passed to the pipeline as `pipeline_root`.
    data_root: Directory handed to CsvExampleGen as `input_base`.
    module_path: File path of the feature selection configuration module,
      forwarded to the FeatureSelection component.
    metadata_path: Location of the SQLite ML Metadata database.
    beam_pipeline_args: Optional Beam arguments forwarded to the pipeline.

  Returns:
    The assembled `tfx.v1.dsl.Pipeline`.
  """
  # Source of examples read from CSV files under `data_root`.
  csv_source = tfx.components.CsvExampleGen(input_base=data_root)

  # Component under test, fed with the examples produced above.
  selector = component.FeatureSelection(
      orig_examples=csv_source.outputs['examples'],
      module_path=module_path,
  )

  connection_config = metadata.sqlite_metadata_connection_config(
      metadata_path)

  return tfx.v1.dsl.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[csv_source, selector],
      metadata_connection_config=connection_config,
      beam_pipeline_args=beam_pipeline_args,
  )


class FeatureSelectionTest(tf.test.TestCase):
def setUp(self):
super(HelloComponentTest, self).setUp()
self.name = 'HelloWorld'

def testConstruct(self):
input_data = standard_artifacts.Examples()
input_data.split_names = json.dumps(artifact.DEFAULT_EXAMPLE_SPLITS)
output_data = standard_artifacts.Examples()
output_data.split_names = json.dumps(artifact.DEFAULT_EXAMPLE_SPLITS)
this_component = component.HelloComponent(
input_data=channel_utils.as_channel([input_data]),
output_data=channel_utils.as_channel([output_data]),
name=u'Testing123')
self.assertEqual(standard_artifacts.Examples.TYPE_NAME,
this_component.outputs['output_data'].type_name)
artifact_collection = this_component.outputs['output_data'].get()
for artifacts in artifact_collection:
split_list = json.loads(artifacts.split_names)
self.assertEqual(artifact.DEFAULT_EXAMPLE_SPLITS.sort(),
split_list.sort())
super().setUp()
self._test_dir = os.path.join(
os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
self._testMethodName)
self._feature_selection_root = os.path.dirname(__file__)
self._pipeline_name = 'feature_selection'
self._data_root = os.path.join(self._feature_selection_root, 'test')
self._data_path = os.path.join(self._data_root, 'iris.csv')
self._module_path = os.path.join(self._feature_selection_root, 'example',
'modules', 'iris_module_file.py')
self._module_file = "tfx_addons.feature_selection.example.modules.\
iris_module_file"

self._pipeline_root = os.path.join(self._test_dir, 'tfx', 'pipelines',
self._pipeline_name)
self._metadata_path = os.path.join(self._test_dir, 'tfx', 'metadata',
self._pipeline_name, 'metadata.db')

def assertExecutedOnce(self, component: Text) -> None:  # pylint: disable=W0621
  """Assert that `component` left exactly one executor execution on disk.

  Args:
    component: Name of the component's directory under the pipeline root.
  """
  fileio = tfx.dsl.io.fileio

  # The component must have produced an output directory at all.
  component_dir = os.path.join(self._pipeline_root, component)
  self.assertTrue(fileio.exists(component_dir))

  # Exactly one entry is expected under the executor execution directory.
  executions = fileio.listdir(
      os.path.join(component_dir, '.system', 'executor_execution'))
  self.assertLen(executions, 1)

def assertPipelineExecution(self) -> None:
  """Assert that both pipeline components were executed exactly once."""
  for component_name in ('CsvExampleGen', 'FeatureSelection'):
    self.assertExecutedOnce(component_name)

def testFeatureSelectionPipelineLocal(self):
  """Run the pipeline with the local runner and validate its ML Metadata.

  Checks the number of executions, the number of `Feature Selection` and
  `Examples` artifacts, that the features recorded by the component match the
  ones computed directly from the CSV data, and that each component left
  exactly one execution directory on disk.
  """
  tfx.v1.orchestration.LocalDagRunner().run(
      _create_pipeline(pipeline_name=self._pipeline_name,
                       pipeline_root=self._pipeline_root,
                       data_root=self._data_root,
                       module_path=self._module_path,
                       metadata_path=self._metadata_path))

  # one each for CsvExampleGen and Feature Selection
  expected_execution_count = 2
  true_selected_features = _get_selected_features(self._module_file,
                                                  self._data_path)

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    # Query the store once per artifact type and reuse the results.
    execution_count = len(m.store.get_executions())
    feature_selection_artifacts = m.store.get_artifacts_by_type(
        "Feature Selection")
    examples_artifacts = m.store.get_artifacts_by_type("Examples")

  # TEST: execution count
  self.assertEqual(expected_execution_count, execution_count)

  # TEST: number of artifacts with TYPE_NAME `Feature Selection`
  # Checked before indexing so that a missing artifact is reported as an
  # assertion failure instead of an IndexError.
  self.assertEqual(1, len(feature_selection_artifacts))

  # TEST: number of artifacts with TYPE_NAME `Examples`
  # (one each from CsvExampleGen and FeatureSelection)
  self.assertEqual(2, len(examples_artifacts))

  # The selected features are read from the `__value__` entry of the
  # artifact's `selected_features` struct property.
  selected_features_value = feature_selection_artifacts[0].properties[
      "selected_features"].struct_value.fields["__value__"]
  component_selected_features = set(
      feature.string_value
      for feature in selected_features_value.list_value.values)

  # TEST: if the features selected by component are correct
  self.assertEqual(component_selected_features, true_selected_features)

  self.assertPipelineExecution()


if __name__ == '__main__':
tf.compat.v1.enable_v2_behavior()
tf.test.main()

# NOTE: pylint warning `W0621: Redefining name 'component' from outer scope` is disabled on `assertExecutedOnce` until an alternative parameter name is chosen.
4 changes: 4 additions & 0 deletions tfx_addons/feature_selection/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
scikit_learn==1.1.2
tensorflow
tfx
tfx_bsl==1.9.0
Loading

0 comments on commit d4628c4

Please sign in to comment.