Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metadata superset #12

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions q2_katharoseq/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,22 +109,30 @@ def read_count_threshold(
positive_control_column = positive_control_column.to_series()
cell_count_column = cell_count_column.to_series()

# # FILTER COLUMNS
if set(positive_control_column.index) != set(cell_count_column.index):
raise ValueError("Columns have different samples")

md_samples = set(positive_control_column.index)
tab_samples = set(table.index)
if not md_samples.issuperset(tab_samples):
raise ValueError("Table contains samples not in metadata")

# overlap to what's present in the feature table
positive_control_column = positive_control_column.loc[list(tab_samples)]
cell_count_column = cell_count_column.loc[list(tab_samples)]

# FILTER COLUMNS
positive_controls = positive_control_column[
positive_control_column == positive_control_value]

if not positive_controls.shape[0]:
raise ValueError('No positive controls found in ' +
'positive control column.')
'positive control column which are in the table.')
positive_controls = pd.Series(positive_controls)

cell_counts = cell_count_column.loc[positive_controls.index]

# # CHECK SHAPES
inds = positive_controls.index.intersection(table.index)
print(inds)
if len(inds) == 0:
raise KeyError('No positive controls found in table.')
table_positive = table.loc[inds]

if threshold > 100 or threshold < 0:
Expand All @@ -135,9 +143,6 @@ def read_count_threshold(
# READS IN HIGHEST INPUT SAMPLE
max_cell_counts = cell_counts.idxmax()

if max_cell_counts not in df.index.values:
raise KeyError('No positive controls found in table.')

max_input = df.loc[max_cell_counts]
max_inputT = max_input.T
max_inputT = max_inputT.sort_values(ascending=False).head(10)
Expand Down
51 changes: 37 additions & 14 deletions q2_katharoseq/tests/test_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,40 @@ def setUp(self):
folder = '../../example'
self.fp = join(dirname(abspath(getfile(currentframe()))), folder)

def test_read_count_threshold_works_with_metadata_superset(self):
    """Run read_count_threshold with metadata holding one extra sample.

    Both fixture metadata columns get an additional 'foo' entry appended
    before being re-wrapped as metadata columns. The test passes when the
    call completes and an ``index.html`` exists in the output directory.
    """
    # NOTE(review): 'foo' is presumably absent from self.table (that is
    # the scenario this test is meant to cover) -- confirm against setUp.
    extra_sample = ['foo', ]

    # Positive-control column: fixture values plus one extra entry.
    base_controls = self.positive_control_column.to_series().copy()
    wider_controls = pd.concat(
        [base_controls, pd.Series(['a', ], index=extra_sample)])
    # Put the series name and index name back after concatenation, as the
    # metadata column wrappers are built from the labelled series.
    wider_controls.name = base_controls.name
    wider_controls.index.name = base_controls.index.name
    pos_control_md = CategoricalMetadataColumn(wider_controls)

    # Cell-count column: the same extension, with a numeric value.
    base_counts = self.cell_count_column.to_series().copy()
    wider_counts = pd.concat(
        [base_counts, pd.Series([1000000, ], index=extra_sample)])
    wider_counts.name = base_counts.name
    wider_counts.index.name = base_counts.index.name
    cell_count_md = NumericMetadataColumn(wider_counts)

    with tempfile.TemporaryDirectory() as output_dir:
        read_count_threshold(
            output_dir,
            self.threshold,
            self.positive_control_value,
            pos_control_md,
            cell_count_md,
            self.table,
            self.control)

        report_path = os.path.join(output_dir, 'index.html')
        self.assertTrue(os.path.exists(report_path))

def test_outputs_index(self):
with tempfile.TemporaryDirectory() as output_dir:
read_count_threshold(
Expand Down Expand Up @@ -129,23 +163,12 @@ def test_no_positive_controls_in_col(self):
self.control)

def test_no_positive_controls_in_table(self):

ind = pd.Index(
['s5', 's6', 's7', 's8'],
name='sampleid')
table = pd.DataFrame(
[[0, 1, 2, 3],
[0, 1, 2, 3],
[5, 4, 3, 2],
[7, 2, 3, 4]],
index=ind, # change index
columns=['f1', 'f2', 'f3', 'f4'])
table = self.table.loc[['s2', 's4']]

with tempfile.TemporaryDirectory() as output_dir, \
self.assertRaisesRegex(
KeyError,
'No positive controls found '
'in table.'):
ValueError,
'No positive controls found'):

read_count_threshold(
output_dir,
Expand Down
Loading