Skip to content

Commit

Permalink
Merge pull request #392 from Crunch-io/use-effective-weights-in-df-calculation-186653738
Browse files (browse the repository at this point in the history)

[#186653738] Enhanced Degrees of Freedom Calculation with Squared Weighted Counts
  • Loading branch information
slobodan-ilic authored Dec 11, 2023
2 parents 07cb337 + 9141ed9 commit ee39d56
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 18 deletions.
51 changes: 35 additions & 16 deletions src/cr/cube/matrix/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -1519,12 +1519,32 @@ def _base_values(self):
# --- Use "body" reference values for base values
(ref_props, ref_bases) = self._reference_values(0)
return self._calculate_t_stats(
self._proportions[0][0], self._bases[0][0], ref_props, ref_bases
self._proportions[0][0], self._column_bases[0][0], ref_props, ref_bases
)

@lazyproperty
def _bases(self):
"""2D array of 2D ndarray "blocks" for the column unweighted bases"""
def _column_bases(self):
"""
Calculate and return the 2D array of 2D ndarray "blocks" representing the
column bases for analysis. Which bases are used depends on whether the
'squared weighted counts' measure is defined.
The method first checks if the 'squared weighted counts' measure is defined.
If it is, the method calculates the 'effective' counts. These are obtained by
squaring the unweighted counts and then dividing each by the corresponding
squared count. This calculation reflects the 'effective' sample size when
weighting is applied.
If the 'squared weighted counts' measure does not exist, the standard
unweighted counts are used. These counts are represented as a simple 2D array
of 2D ndarray blocks without any modification, directly reflecting the raw,
unweighted counts.
Returns:
numpy.ndarray: A 2D array of 2D ndarray counts, representing the
calculated column bases (either 'effective' or unweighted counts) for
the analysis.
"""
unweighted_blocks = self._second_order_measures.column_unweighted_bases.blocks
if self._second_order_measures.columns_squared_base.is_defined:
squared_blocks = self._second_order_measures.column_squared_bases.blocks
Expand Down Expand Up @@ -1556,10 +1576,10 @@ def _reference_values(self, block_index):
col_idx = self._selected_column_idx
if col_idx < 0:
props = self._proportions[block_index][1]
bases = self._bases[block_index][1]
bases = self._column_bases[block_index][1]
else:
props = self._proportions[block_index][0]
bases = self._bases[block_index][0]
bases = self._column_bases[block_index][0]

return (props[:, [col_idx]], bases[:, [col_idx]])

Expand All @@ -1586,7 +1606,7 @@ def _intersections(self):
# --- Use "inserted" reference values for intersections
(ref_props, ref_variance) = self._reference_values(1)
return self._calculate_t_stats(
self._proportions[1][1], self._bases[1][1], ref_props, ref_variance
self._proportions[1][1], self._column_bases[1][1], ref_props, ref_variance
)

@lazyproperty
Expand All @@ -1600,7 +1620,7 @@ def _subtotal_columns(self):
# --- Use "body" reference values for inserted columns
(ref_props, ref_variance) = self._reference_values(0)
return self._calculate_t_stats(
self._proportions[0][1], self._bases[0][1], ref_props, ref_variance
self._proportions[0][1], self._column_bases[0][1], ref_props, ref_variance
)

@lazyproperty
Expand All @@ -1609,7 +1629,7 @@ def _subtotal_rows(self):
# --- Use "inserted" reference values for inserted rows
(ref_props, ref_variance) = self._reference_values(1)
return self._calculate_t_stats(
self._proportions[1][0], self._bases[1][0], ref_props, ref_variance
self._proportions[1][0], self._column_bases[1][0], ref_props, ref_variance
)


Expand All @@ -1624,18 +1644,18 @@ def blocks(self):
"""2D array of the four 2D "blocks" making up this measure."""
col_idx = self._selected_column_idx
t_stats = self._second_order_measures.pairwise_t_stats(col_idx).blocks
column_bases = self._second_order_measures.column_unweighted_bases.blocks
body_selected_base = self._selected_columns_base(0)
ins_selected_base = self._selected_columns_base(1)
col_bases = self._column_bases

return [
[
self._p_vals(t_stats[0][0], column_bases[0][0], body_selected_base),
self._p_vals(t_stats[0][1], column_bases[0][1], body_selected_base),
self._p_vals(t_stats[0][0], col_bases[0][0], body_selected_base),
self._p_vals(t_stats[0][1], col_bases[0][1], body_selected_base),
],
[
self._p_vals(t_stats[1][0], column_bases[1][0], ins_selected_base),
self._p_vals(t_stats[1][1], column_bases[1][1], ins_selected_base),
self._p_vals(t_stats[1][0], col_bases[1][0], ins_selected_base),
self._p_vals(t_stats[1][1], col_bases[1][1], ins_selected_base),
],
]

Expand All @@ -1661,11 +1681,10 @@ def _selected_columns_base(self, table_index):
don't have to broadcast.
"""
col_idx = self._selected_column_idx
column_bases = self._second_order_measures.column_unweighted_bases.blocks
return (
column_bases[table_index][1][:, [col_idx]]
self._column_bases[table_index][1][:, [col_idx]]
if col_idx < 0
else column_bases[table_index][0][:, [col_idx]]
else self._column_bases[table_index][0][:, [col_idx]]
)


Expand Down
7 changes: 7 additions & 0 deletions tests/integration/test_cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -1907,6 +1907,13 @@ def it_uses_squared_weights_for_effect_calculation(self):
[NA, 0.0, 0.0, -0.9486833, NA, NA, NA, 1.8973666, -0.9486833, NA],
],
)
np.testing.assert_almost_equal(
slice_._measures.pairwise_p_vals(1).blocks[0][0],
[
[NA, 1.0, 1.0, 0.66381998, NA, NA, NA, 0.19510957, 0.66381998, NA],
[NA, 1.0, 1.0, 0.66381998, NA, NA, NA, 0.19510957, 0.66381998, NA],
],
)


class Describe_Strand:
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/matrix/test_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,7 @@ def it_provides_the_bases_to_help(self, second_order_measures_):
second_order_measures_.columns_squared_base.is_defined = False
pairwise_tstat = _PairwiseSigTstats(None, second_order_measures_, None, None)

assert pairwise_tstat._bases == [1, 2]
assert pairwise_tstat._column_bases == [1, 2]

def it_can_calculate_the_t_stat_to_help(self):
pairwise_tstat = _PairwiseSigTstats(None, None, None, None)
Expand Down Expand Up @@ -1039,7 +1039,7 @@ def it_provides_the_subtotal_rows_to_help(

@pytest.fixture
def _bases_prop_(self, request):
return property_mock(request, _PairwiseSigTstats, "_bases")
return property_mock(request, _PairwiseSigTstats, "_column_bases")

@pytest.fixture
def _calculate_t_stats_(self, request):
Expand Down

0 comments on commit ee39d56

Please sign in to comment.