From c9e73a3be1f583e1cd9053e2a657420250b1add1 Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Mon, 11 Dec 2023 06:45:56 +0100 Subject: [PATCH 1/3] [#186653738]: Add test that hits pairwise DOF * The test is introduced to exercise the execution path that hits the pairwise degrees of freedom calculation, in the case of the presence of the "squared weighted counts" cube measure. * The test will prevent regression to the standard unweighted counts, if the squared ones are available. --- tests/integration/test_cubepart.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/test_cubepart.py b/tests/integration/test_cubepart.py index 1f68a9931..34a2723f7 100644 --- a/tests/integration/test_cubepart.py +++ b/tests/integration/test_cubepart.py @@ -1907,6 +1907,13 @@ def it_uses_squared_weights_for_effect_calculation(self): [NA, 0.0, 0.0, -0.9486833, NA, NA, NA, 1.8973666, -0.9486833, NA], ], ) + np.testing.assert_almost_equal( + slice_._measures.pairwise_p_vals(1).blocks[0][0], + [ + [NA, 1.0, 1.0, 0.66381998, NA, NA, NA, 0.19510957, 0.66381998, NA], + [NA, 1.0, 1.0, 0.66381998, NA, NA, NA, 0.19510957, 0.66381998, NA], + ], + ) class Describe_Strand: From 77e776f1b9577c38fc3a8155ae411c7bfa7283f5 Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Mon, 11 Dec 2023 06:47:44 +0100 Subject: [PATCH 2/3] [#186653738]: Use effective column bases in DOF * If available use effective counts for DOF in pairwise comparison * Improve docstring for how bases are calculated --- src/cr/cube/matrix/measure.py | 49 ++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/src/cr/cube/matrix/measure.py b/src/cr/cube/matrix/measure.py index cf6a94eb9..a4f62df94 100644 --- a/src/cr/cube/matrix/measure.py +++ b/src/cr/cube/matrix/measure.py @@ -1523,8 +1523,28 @@ def _base_values(self): ) @lazyproperty - def _bases(self): - """2D array of 2D ndarray "blocks" for the column unweighted bases""" + def _column_bases(self): + """ + 
Calculate and return the 2D array of 2D ndarray "blocks" representing the + column bases for analysis. These bases are determined based on the presence + or absence of the 'squared weighted counts'. + + The method first checks if the 'squared weighted counts' measure is defined. + If it is, the method calculates the 'effective' counts. These are obtained by + squaring the unweighted counts and then dividing each by the corresponding + squared count. This calculation reflects the 'effective' sample size when + weighting is applied. + + If the 'squared weighted counts' measure does not exist, the standard + unweighted counts are used. These counts are represented as a simple 2D array + of 2D ndarray blocks without any modification, directly reflecting the raw, + unweighted counts. + + Returns: + numpy.ndarray: A 2D array of 2D ndarray counts, representing the + calculated column bases (either 'effective' or unweighted counts) for + the analysis. + """ unweighted_blocks = self._second_order_measures.column_unweighted_bases.blocks if self._second_order_measures.columns_squared_base.is_defined: squared_blocks = self._second_order_measures.column_squared_bases.blocks @@ -1556,10 +1576,10 @@ def _reference_values(self, block_index): col_idx = self._selected_column_idx if col_idx < 0: props = self._proportions[block_index][1] - bases = self._bases[block_index][1] + bases = self._column_bases[block_index][1] else: props = self._proportions[block_index][0] - bases = self._bases[block_index][0] + bases = self._column_bases[block_index][0] return (props[:, [col_idx]], bases[:, [col_idx]]) @@ -1586,7 +1606,7 @@ def _intersections(self): # --- Use "inserted" reference values for intersections (ref_props, ref_variance) = self._reference_values(1) return self._calculate_t_stats( - self._proportions[1][1], self._bases[1][1], ref_props, ref_variance + self._proportions[1][1], self._column_bases[1][1], ref_props, ref_variance ) @lazyproperty @@ -1600,7 +1620,7 @@ def 
_subtotal_columns(self): # --- Use "body" reference values for inserted columns (ref_props, ref_variance) = self._reference_values(0) return self._calculate_t_stats( - self._proportions[0][1], self._bases[0][1], ref_props, ref_variance + self._proportions[0][1], self._column_bases[0][1], ref_props, ref_variance ) @lazyproperty @@ -1609,7 +1629,7 @@ def _subtotal_rows(self): # --- Use "inserted" reference values for inserted rows (ref_props, ref_variance) = self._reference_values(1) return self._calculate_t_stats( - self._proportions[1][0], self._bases[1][0], ref_props, ref_variance + self._proportions[1][0], self._column_bases[1][0], ref_props, ref_variance ) @@ -1624,18 +1644,18 @@ def blocks(self): """2D array of the four 2D "blocks" making up this measure.""" col_idx = self._selected_column_idx t_stats = self._second_order_measures.pairwise_t_stats(col_idx).blocks - column_bases = self._second_order_measures.column_unweighted_bases.blocks body_selected_base = self._selected_columns_base(0) ins_selected_base = self._selected_columns_base(1) + col_bases = self._column_bases return [ [ - self._p_vals(t_stats[0][0], column_bases[0][0], body_selected_base), - self._p_vals(t_stats[0][1], column_bases[0][1], body_selected_base), + self._p_vals(t_stats[0][0], col_bases[0][0], body_selected_base), + self._p_vals(t_stats[0][1], col_bases[0][1], body_selected_base), ], [ - self._p_vals(t_stats[1][0], column_bases[1][0], ins_selected_base), - self._p_vals(t_stats[1][1], column_bases[1][1], ins_selected_base), + self._p_vals(t_stats[1][0], col_bases[1][0], ins_selected_base), + self._p_vals(t_stats[1][1], col_bases[1][1], ins_selected_base), ], ] @@ -1661,11 +1681,10 @@ def _selected_columns_base(self, table_index): don't have to broadcast. 
""" col_idx = self._selected_column_idx - column_bases = self._second_order_measures.column_unweighted_bases.blocks return ( - column_bases[table_index][1][:, [col_idx]] + self._column_bases[table_index][1][:, [col_idx]] if col_idx < 0 - else column_bases[table_index][0][:, [col_idx]] + else self._column_bases[table_index][0][:, [col_idx]] ) From 9141ed90ee4dc6426f749254abd82e222802dfb4 Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Mon, 11 Dec 2023 18:20:00 +0100 Subject: [PATCH 3/3] [#186653738]: Fix unit tests --- src/cr/cube/matrix/measure.py | 2 +- tests/unit/matrix/test_measure.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cr/cube/matrix/measure.py b/src/cr/cube/matrix/measure.py index a4f62df94..37392d37e 100644 --- a/src/cr/cube/matrix/measure.py +++ b/src/cr/cube/matrix/measure.py @@ -1519,7 +1519,7 @@ def _base_values(self): # --- Use "body" reference values for base values (ref_props, ref_bases) = self._reference_values(0) return self._calculate_t_stats( - self._proportions[0][0], self._bases[0][0], ref_props, ref_bases + self._proportions[0][0], self._column_bases[0][0], ref_props, ref_bases ) @lazyproperty diff --git a/tests/unit/matrix/test_measure.py b/tests/unit/matrix/test_measure.py index 91684c574..446593226 100644 --- a/tests/unit/matrix/test_measure.py +++ b/tests/unit/matrix/test_measure.py @@ -907,7 +907,7 @@ def it_provides_the_bases_to_help(self, second_order_measures_): second_order_measures_.columns_squared_base.is_defined = False pairwise_tstat = _PairwiseSigTstats(None, second_order_measures_, None, None) - assert pairwise_tstat._bases == [1, 2] + assert pairwise_tstat._column_bases == [1, 2] def it_can_calculate_the_t_stat_to_help(self): pairwise_tstat = _PairwiseSigTstats(None, None, None, None) @@ -1039,7 +1039,7 @@ def it_provides_the_subtotal_rows_to_help( @pytest.fixture def _bases_prop_(self, request): - return property_mock(request, _PairwiseSigTstats, "_bases") + return property_mock(request, 
_PairwiseSigTstats, "_column_bases") @pytest.fixture def _calculate_t_stats_(self, request):