From c9fe7b7ad47f4798b8888716531689e13acc974f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 16 Jan 2024 12:09:47 -0800 Subject: [PATCH 1/2] BUG: read_count_threshold assumes samples in metadata are present in table --- q2_katharoseq/tests/test_method.py | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/q2_katharoseq/tests/test_method.py b/q2_katharoseq/tests/test_method.py index 17b6891..a548c3e 100644 --- a/q2_katharoseq/tests/test_method.py +++ b/q2_katharoseq/tests/test_method.py @@ -58,6 +58,40 @@ def setUp(self): folder = '../../example' self.fp = join(dirname(abspath(getfile(currentframe()))), folder) + def test_read_count_threshold_works_with_metadata_superset(self): + # make sure we work when a katharoseq control isn't present in the + # table + pos_control_col = self.positive_control_column.to_series().copy() + pos_control_col_name = pos_control_col.name + pos_control_col_idxname = pos_control_col.index.name + pos_control_col = pd.concat([pos_control_col, + pd.Series(['a', ], index=['foo', ])]) + pos_control_col.name = pos_control_col_name + pos_control_col.index.name = pos_control_col_idxname + pos_control_col = CategoricalMetadataColumn(pos_control_col) + + cell_count_col = self.cell_count_column.to_series().copy() + cell_count_col_name = cell_count_col.name + cell_count_col_idxname = cell_count_col.index.name + cell_count_col = pd.concat([cell_count_col, + pd.Series([1000000, ], index=['foo', ])]) + cell_count_col.name = cell_count_col_name + cell_count_col.index.name = cell_count_col_idxname + cell_count_col = NumericMetadataColumn(cell_count_col) + + with tempfile.TemporaryDirectory() as output_dir: + read_count_threshold( + output_dir, + self.threshold, + self.positive_control_value, + pos_control_col, + cell_count_col, + self.table, + self.control) + + index_fp = os.path.join(output_dir, 'index.html') + self.assertTrue(os.path.exists(index_fp)) + def test_outputs_index(self): with tempfile.TemporaryDirectory() as output_dir: read_count_threshold( From bba25a2c5699065916c23cb3f84d5493249910eb Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 16 Jan 2024 12:21:04 -0800 Subject: [PATCH 2/2] BUG: allow metadata to be a superset of the table --- q2_katharoseq/_methods.py | 23 ++++++++++++++--------- q2_katharoseq/tests/test_method.py | 17 +++-------------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/q2_katharoseq/_methods.py b/q2_katharoseq/_methods.py index c641c11..4f609e3 100644 --- a/q2_katharoseq/_methods.py +++ b/q2_katharoseq/_methods.py @@ -109,22 +109,30 @@ def read_count_threshold( positive_control_column = positive_control_column.to_series() cell_count_column = cell_count_column.to_series() - # # FILTER COLUMNS + if set(positive_control_column.index) != set(cell_count_column.index): + raise ValueError("Columns have different samples") + + md_samples = set(positive_control_column.index) + tab_samples = set(table.index) + if not md_samples.issuperset(tab_samples): + raise ValueError("Table contains samples not in metadata") + + # overlap to what's present in the feature table + positive_control_column = positive_control_column.loc[list(tab_samples)] + cell_count_column = cell_count_column.loc[list(tab_samples)] + + # FILTER COLUMNS positive_controls = positive_control_column[ positive_control_column == positive_control_value] if not positive_controls.shape[0]: raise ValueError('No positive controls found in ' + - 'positive control column.') + 'positive control column which are in the table.') positive_controls = pd.Series(positive_controls) cell_counts = cell_count_column.loc[positive_controls.index] - # # CHECK SHAPES inds = positive_controls.index.intersection(table.index) - print(inds) - if len(inds) == 0: - raise KeyError('No positive controls found in table.') table_positive = table.loc[inds] if threshold > 100 or threshold < 0: @@ -135,9 +143,6 @@ def read_count_threshold( # READS IN HIGHEST INPUT SAMPLE max_cell_counts = cell_counts.idxmax() - if max_cell_counts not in df.index.values: - raise KeyError('No positive controls found in table.') - max_input = df.loc[max_cell_counts] max_inputT = max_input.T max_inputT = max_inputT.sort_values(ascending=False).head(10) diff --git a/q2_katharoseq/tests/test_method.py b/q2_katharoseq/tests/test_method.py index a548c3e..dfb581d 100644 --- a/q2_katharoseq/tests/test_method.py +++ b/q2_katharoseq/tests/test_method.py @@ -163,23 +163,12 @@ def test_no_positive_controls_in_col(self): self.control) def test_no_positive_controls_in_table(self): - - ind = pd.Index( - ['s5', 's6', 's7', 's8'], - name='sampleid') - table = pd.DataFrame( - [[0, 1, 2, 3], - [0, 1, 2, 3], - [5, 4, 3, 2], - [7, 2, 3, 4]], - index=ind, # change index - columns=['f1', 'f2', 'f3', 'f4']) + table = self.table.loc[['s2', 's4']] with tempfile.TemporaryDirectory() as output_dir, \ self.assertRaisesRegex( - KeyError, - 'No positive controls found ' - 'in table.'): + ValueError, + 'No positive controls found'): read_count_threshold( output_dir,