Skip to content

Commit

Permalink
Merge pull request #399 from Crunch-io/median-measure-187887894
Browse files Browse the repository at this point in the history
[#187887894]: Median measure
  • Loading branch information
ernestoarbitrio authored Jul 4, 2024
2 parents 5f52bdb + 63f9271 commit 73dd135
Show file tree
Hide file tree
Showing 25 changed files with 3,871 additions and 0 deletions.
46 changes: 46 additions & 0 deletions src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,15 @@ def means(self) -> Optional[np.ndarray]:
return None
return self._measures.means.raw_cube_array[self._valid_idxs].astype(np.float64)

@lazyproperty
def medians(self) -> Optional[np.ndarray]:
    """Optional float64 ndarray of the cube medians.

    Returns None when the cube has no median measure. Otherwise the raw median
    array is indexed by `self._valid_idxs` and cast to `np.float64`.
    """
    median_measure = self._measures.medians
    if median_measure is None:
        return None
    valid_medians = median_measure.raw_cube_array[self._valid_idxs]
    return valid_medians.astype(np.float64)

@lazyproperty
def missing(self) -> int:
"""Get missing count of a cube."""
Expand Down Expand Up @@ -781,6 +790,14 @@ def means(self) -> 'Optional["_MeanMeasure"]':
mean = _MeanMeasure(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
return None if mean.raw_cube_array is None else mean

@lazyproperty
def medians(self) -> 'Optional["_MediansMeasure"]':
    """Optional _MediansMeasure object providing access to median values.

    None when the measure object's `.raw_cube_array` is None.
    """
    median_measure = _MediansMeasure(
        self._cube_dict, self._all_dimensions, self._cube_idx_arg
    )
    if median_measure.raw_cube_array is None:
        return None
    return median_measure

@lazyproperty
def missing_count(self) -> int:
"""numeric representing count of missing rows in cube response."""
Expand All @@ -790,6 +807,10 @@ def missing_count(self) -> int:
# fixtures that don't have valid_counts.
if self.means is not None:
return self.means.missing_count
# The check on the median measure is needed for retro-compatibility with the old
# fixtures that don't have valid_counts.
if self.medians is not None:
return self.medians.missing_count
return self._cube_dict["result"].get("missing", 0)

@lazyproperty
Expand Down Expand Up @@ -1047,6 +1068,31 @@ def _flat_values(self) -> Optional[np.ndarray]:
).flatten()


class _MediansMeasure(_BaseMeasure):
    """Statistical median values from a cube-response."""

    @lazyproperty
    def missing_count(self) -> int:
        """Count of missing rows recorded on the median measure.

        Read from the "n_missing" item of the "median" measure in the
        cube-response; 0 when that item is absent.
        """
        median_dict = self._cube_dict["result"]["measures"]["median"]
        return median_dict.get("n_missing", 0)

    @lazyproperty
    def _flat_values(self) -> Optional[np.ndarray]:
        """Optional 1D np.float64 ndarray of median values from the cube response.

        None when the cube response has no "median" measure. Any item of the
        measure data that is a dict (e.g. a missing-value marker like
        {'?': -1}) is replaced by np.nan in the returned array.
        """
        measures = self._cube_dict["result"].get("measures", {})
        median_payload = measures.get("median")
        if median_payload is None:
            return None
        raw_values = np.array(median_payload["data"]).flatten()
        cleaned = [
            np.nan if isinstance(value, dict) else value for value in raw_values
        ]
        return np.array(cleaned, dtype=np.float64).flatten()


class _OverlapMeasure(_BaseMeasure):
"""Overlap values from a cube-response."""

Expand Down
31 changes: 31 additions & 0 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,23 @@ def means(self):
"`.means` is undefined for a cube-result without a mean measure"
)

@lazyproperty
def medians(self):
    """2D optional np.float64 ndarray of median values for each table cell.

    Cell value is `np.nan` for each cell corresponding to an inserted subtotal
    (medians of addend cells cannot simply be added to get the median of the
    subtotal).

    Raises `ValueError` if the cube-result does not include a median
    cube-measure.
    """
    try:
        median_blocks = self._measures.medians.blocks
        return self._assemble_matrix(median_blocks)
    except ValueError:
        raise ValueError(
            "`.medians` is undefined for a cube-result without a median measure"
        )

@lazyproperty
def min_base_size_mask(self):
    """MinBaseSizeMask object built from this object and its `_mask_size`."""
    mask_size = self._mask_size
    return MinBaseSizeMask(self, mask_size)
Expand Down Expand Up @@ -2005,6 +2022,20 @@ def means(self):
"`.means` is undefined for a cube-result without a mean measure"
)

@lazyproperty
def medians(self):
    """1D np.float64 ndarray of medians for each row of strand.

    Raises `ValueError` when accessed on a cube-result that does not contain a
    median cube-measure.
    """
    try:
        median_blocks = self._measures.medians.blocks
        return self._assemble_vector(median_blocks)
    except ValueError:
        raise ValueError(
            "`.medians` is undefined for a cube-result without a median measure"
        )

@lazyproperty
def min_base_size_mask(self):
"""1D bool ndarray of True for each row that fails to meet min-base spec.
Expand Down
1 change: 1 addition & 0 deletions src/cr/cube/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ class CUBE_MEASURE(enum.Enum):
COVARIANCE = "covariance"
COUNT = "count"
MEAN = "mean"
MEDIAN = "median"
OVERLAP = "overlap"
STDDEV = "stddev"
SUM = "sum"
Expand Down
91 changes: 91 additions & 0 deletions src/cr/cube/matrix/cubemeasure.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ def cube_means(self):
"""_BaseCubeMeans subclass object for this cube-result."""
return _BaseCubeMeans.factory(self._cube, self._dimensions, self._slice_idx)

@lazyproperty
def cube_medians(self):
    """_BaseCubeMedians subclass object for this cube-result."""
    cube, dimensions, slice_idx = self._cube, self._dimensions, self._slice_idx
    return _BaseCubeMedians.factory(cube, dimensions, slice_idx)

@lazyproperty
def cube_overlaps(self):
"""_BaseCubeOverlaps subclass object for this cube-result."""
Expand Down Expand Up @@ -877,6 +882,92 @@ def means(self):
return self._means[:, 0, :, 0]


# === MEDIANS ===


class _BaseCubeMedians(_BaseCubeMeasure):
    """Base class for medians cube-measure variants."""

    def __init__(self, dimensions, medians):
        super().__init__(dimensions)
        self._medians = medians

    @classmethod
    def factory(cls, cube, dimensions, slice_idx):
        """Return _BaseCubeMedians subclass instance appropriate to `cube`.

        The subclass is chosen from the last two dimension-types of `cube`,
        according to which of them (if any) is MR.

        Raises `ValueError` if the cube-result does not include a cube-median
        measure.
        """
        if cube.medians is None:
            raise ValueError("cube-result does not contain cube-median measure")

        dimension_types = cube.dimension_types[-2:]
        if dimension_types == (DT.MR, DT.MR):
            CubeMedianCls = _MrXMrCubeMedians
        elif dimension_types[0] == DT.MR:
            CubeMedianCls = _MrXCatCubeMedians
        elif dimension_types[1] == DT.MR:
            CubeMedianCls = _CatXMrCubeMedians
        else:
            CubeMedianCls = _CatXCatCubeMedians

        # --- only the portion of the cube medians selected for this slice ---
        slice_medians = cube.medians[cls._slice_idx_expr(cube, slice_idx)]
        return CubeMedianCls(dimensions, slice_medians)

    @lazyproperty
    def medians(self):
        """2D np.float64 ndarray of cube medians.

        Must be implemented by each subclass; this base version always raises
        `NotImplementedError`.
        """
        raise NotImplementedError(  # pragma: no cover
            f"`{type(self).__name__}` must implement `.medians`"
        )


class _CatXCatCubeMedians(_BaseCubeMedians):
    """Medians cube-measure for a slice with no MR dimensions."""

    @lazyproperty
    def medians(self):
        """2D np.float64 ndarray of medians for each valid matrix cell.

        The medians array provided on construction is returned unchanged.
        """
        medians = self._medians
        return medians


class _CatXMrCubeMedians(_BaseCubeMedians):
    """Medians cube-measure for a NOT_MR_X_MR slice.

    Note that the rows-dimensions need not actually be CAT.
    """

    @lazyproperty
    def medians(self):
        """2D np.float64 ndarray of medians for each valid matrix cell.

        Takes index 0 on the third axis of the medians array (presumably the
        "selected" position of the MR columns axis -- confirm against caller).
        """
        first_on_last_axis = self._medians[:, :, 0]
        return first_on_last_axis


class _MrXCatCubeMedians(_BaseCubeMedians):
    """Medians cube-measure for an MR_X_NOT_MR slice.

    Note that the columns-dimension need not actually be CAT.
    """

    @lazyproperty
    def medians(self):
        """2D np.float64 ndarray of medians for each valid matrix cell.

        Takes index 0 on the second axis of the medians array (presumably the
        "selected" position of the MR rows axis -- confirm against caller).
        """
        first_on_second_axis = self._medians[:, 0, :]
        return first_on_second_axis


class _MrXMrCubeMedians(_BaseCubeMedians):
    """Medians cube-measure for an MR_X_MR slice."""

    @lazyproperty
    def medians(self):
        """2D np.float64 ndarray of medians for each valid matrix cell.

        Takes index 0 on both the second and fourth axes of the medians array
        (presumably the "selected" position of each MR axis -- confirm).
        """
        first_on_both_axes = self._medians[:, 0, :, 0]
        return first_on_both_axes


# === OVERLAPS ===


Expand Down
16 changes: 16 additions & 0 deletions src/cr/cube/matrix/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,11 @@ def means(self):
"""_Means measure object for this cube-result"""
return _Means(self._dimensions, self, self._cube_measures)

@lazyproperty
def medians(self):
    """_Medians measure object for this cube-result."""
    dimensions, cube_measures = self._dimensions, self._cube_measures
    return _Medians(dimensions, self, cube_measures)

def pairwise_p_vals_for_subvar(self, subvar_idx):
"""_PairwiseSigPValsForSubvar measure object for this cube-result"""
return _PairwiseSigPValsForSubvar(
Expand Down Expand Up @@ -1177,6 +1182,17 @@ def blocks(self):
)


class _Medians(_BaseSecondOrderMeasure):
    """Provides the medians measure for a matrix."""

    @lazyproperty
    def blocks(self):
        """2D array of the four 2D "blocks" making up this measure.

        Produced by `NanSubtotals.blocks()` from the medians of the cube-medians
        cube-measure and the dimensions of this measure.
        """
        cube_medians = self._cube_measures.cube_medians.medians
        return NanSubtotals.blocks(cube_medians, self._dimensions)


class _MeansSmoothed(_Means, _SmoothedMeasure):
"""Provides the smoothed mean measure for a matrix."""

Expand Down
56 changes: 56 additions & 0 deletions src/cr/cube/stripe/cubemeasure.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def cube_means(self):
"""_BaseCubeMeans subclass object for this stripe."""
return _BaseCubeMeans.factory(self._cube, self._rows_dimension)

@lazyproperty
def cube_medians(self):
    """_BaseCubeMedians subclass object for this stripe."""
    cube, rows_dimension = self._cube, self._rows_dimension
    return _BaseCubeMedians.factory(cube, rows_dimension)

@lazyproperty
def cube_stddev(self):
"""_BaseCubeStdDev subclass object for this stripe."""
Expand Down Expand Up @@ -253,6 +258,57 @@ def means(self):
return self._means[:, 0]


# === MEDIANS ===


class _BaseCubeMedians(_BaseCubeMeasure):
    """Base class for medians cube-measure variants."""

    def __init__(self, rows_dimension, medians):
        super().__init__(rows_dimension)
        self._medians = medians

    @classmethod
    def factory(cls, cube, rows_dimension):
        """Return _BaseCubeMedians subclass instance appropriate to `cube`.

        An MR rows-dimension selects `_MrCubeMedians`; any other dimension-type
        selects `_CatCubeMedians`.

        Raises `ValueError` if the cube-result does not include a cube-median
        measure.
        """
        if cube.medians is None:
            raise ValueError("cube-result does not contain cube-median measure")

        if rows_dimension.dimension_type == DT.MR:
            MedianCls = _MrCubeMedians
        else:
            MedianCls = _CatCubeMedians
        return MedianCls(rows_dimension, cube.medians)

    @lazyproperty
    def medians(self):
        """1D np.float64 ndarray of medians for each stripe row.

        Must be implemented by each subclass; this base version always raises
        `NotImplementedError`.
        """
        raise NotImplementedError(
            f"`{type(self).__name__}` must implement `.medians`"
        )  # pragma: no cover


class _CatCubeMedians(_BaseCubeMedians):
    """Medians cube-measure for a non-MR stripe."""

    @lazyproperty
    def medians(self):
        """1D np.float64 ndarray of medians for each stripe row.

        The medians array provided on construction is returned unchanged.
        """
        medians = self._medians
        return medians


class _MrCubeMedians(_BaseCubeMedians):
    """Medians cube-measure for an MR stripe.

    Its `._medians` is a 2D ndarray with axes (rows, sel/not).
    """

    @lazyproperty
    def medians(self):
        """1D np.float64 ndarray of medians for each stripe row.

        Only the first column (index 0 of the sel/not axis) is kept.
        """
        first_column = self._medians[:, 0]
        return first_column


# === STD DEV ===


Expand Down
26 changes: 26 additions & 0 deletions src/cr/cube/stripe/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ def means(self):
"""_Means measure object for this stripe."""
return _Means(self._rows_dimension, self, self._cube_measures)

@lazyproperty
def medians(self):
    """_Medians measure object for this stripe."""
    rows_dimension, cube_measures = self._rows_dimension, self._cube_measures
    return _Medians(rows_dimension, self, cube_measures)

@lazyproperty
def population_proportions(self):
"""_PopulationPrortion measure object for this stripe."""
Expand Down Expand Up @@ -231,6 +236,27 @@ def subtotal_values(self):
return NanSubtotals.subtotal_values(self.base_values, self._rows_dimension)


class _Medians(_BaseSecondOrderMeasure):
    """Provides the medians measure for a stripe.

    Relies on the presence of a medians cube-measure in the cube-result.
    """

    @lazyproperty
    def base_values(self):
        """1D np.float64 ndarray of medians for each row."""
        cube_medians = self._cube_measures.cube_medians
        return cube_medians.medians

    @lazyproperty
    def subtotal_values(self):
        """1D ndarray of np.nan for each row-subtotal.

        Median values cannot be subtotaled, so each subtotal value is
        unconditionally np.nan. Delegates to `NanSubtotals.subtotal_values()`.
        """
        base_values = self.base_values
        return NanSubtotals.subtotal_values(base_values, self._rows_dimension)


class _MeansSmoothed(_Means, _SmoothedMeasure):
"""Provides the smoothed means measure for a stripe.
Expand Down
Loading

0 comments on commit 73dd135

Please sign in to comment.