Skip to content

Commit

Permalink
Merge pull request #394 from Crunch-io/memory-leakage-deepcopy-186966946
Browse files Browse the repository at this point in the history
remove deepcopy from cube response
  • Loading branch information
ernestoarbitrio authored Feb 4, 2024
2 parents 049c0e3 + 18d28f3 commit 990ec99
Show file tree
Hide file tree
Showing 15 changed files with 169 additions and 185 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ repos:
- id: trailing-whitespace

- repo: https://github.com/ambv/black
rev: 22.6.0
rev: 24.1.1
hooks:
- id: black

Expand Down
47 changes: 21 additions & 26 deletions src/cr/cube/cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
CubeSet is the main API class for manipulating Crunch.io JSON cube responses.
"""

import copy
import json
from typing import Dict, FrozenSet, Iterator, List, Optional, Tuple, Union

Expand Down Expand Up @@ -260,11 +259,15 @@ def counts_with_missings(self) -> np.ndarray:
return (
self._measures.weighted_valid_counts.raw_cube_array
if self._measures.weighted_valid_counts is not None
else self._measures.unweighted_valid_counts.raw_cube_array
if self._measures.unweighted_valid_counts is not None
else self._measures.weighted_counts.raw_cube_array
if self.has_weighted_counts
else self._measures.unweighted_counts.raw_cube_array
else (
self._measures.unweighted_valid_counts.raw_cube_array
if self._measures.unweighted_valid_counts is not None
else (
self._measures.weighted_counts.raw_cube_array
if self.has_weighted_counts
else self._measures.unweighted_counts.raw_cube_array
)
)
)

@lazyproperty
Expand Down Expand Up @@ -312,8 +315,10 @@ def inflate(self) -> "Cube":
A multi-cube (tabbook) response formed from a function (e.g. mean()) on
a numeric variable arrives without a rows-dimension.
"""
cube_dict = self._cube_dict
dimensions = cube_dict["result"]["dimensions"]
cube_dict = self._cube_response
num_array_dim = self._numeric_array_dimension or None
dims = cube_dict["result"]["dimensions"]
dimensions = [num_array_dim] + dims if num_array_dim else dims
default_name = "-".join([m.value for m in self._available_numeric_measures])
# --- The default value in case of numeric variable is the combination of all
# --- the measures expressed in the cube response.
Expand Down Expand Up @@ -435,7 +440,7 @@ def title(self) -> str:
use-case it is a stand-in for the columns-dimension name since a strand has no
columns dimension.
"""
return self._cube_dict["result"].get("title", "Untitled")
return self._cube_response["result"].get("title", "Untitled")

@lazyproperty
def unweighted_counts(self) -> np.ndarray:
Expand Down Expand Up @@ -533,9 +538,12 @@ def weighted_squared_counts(self) -> Optional[np.ndarray]:
].astype(np.float64)

@lazyproperty
def _all_dimensions(self) -> list:
def _all_dimensions(self) -> Dimensions:
"""List of all dimensions (not just user-apparent ones) for this cube."""
return Dimensions.from_dicts(self._cube_dict["result"]["dimensions"])
num_array_dim = self._numeric_array_dimension or None
dims = self._cube_response["result"]["dimensions"]
dimension_dicts = [num_array_dim] + dims if num_array_dim else dims
return Dimensions.from_dicts(dimension_dicts)

@lazyproperty
def _available_numeric_measures(self) -> Tuple[CUBE_MEASURE, ...]:
Expand All @@ -560,19 +568,6 @@ def _ca_as_0th(self) -> bool:
and self.dimension_types[0] == DT.CA
)

@lazyproperty
def _cube_dict(self) -> Dict:
    """Deep copy of the cube response, with the numeric-array dimension prepended.

    The copy is made with `copy.deepcopy()`, so the insertion below never touches
    `self._cube_response` itself. When the cube has numeric-measure subvariables,
    `self._numeric_array_dimension` is inserted at position 0 of the copy's
    `["result"]["dimensions"]` list; otherwise the copy is returned unchanged.
    """
    response_copy = copy.deepcopy(self._cube_response)

    # ---nothing to inflate when there are no numeric-measure subvariables---
    if not self._numeric_measure_subvariables:
        return response_copy

    # ---dim inflation---
    # ---For numeric arrays the dimension describing the subvariables becomes the
    # ---leading dimension. If "result" or "dimensions" is absent, the insert lands
    # ---on a throw-away default list and the returned copy is unaffected.
    dimension_dicts = response_copy.get("result", {}).get("dimensions", [])
    dimension_dicts.insert(0, self._numeric_array_dimension)
    return response_copy

@lazyproperty
def _cube_response(self) -> Dict:
"""dict representing the parsed cube response arguments."""
Expand All @@ -593,7 +588,7 @@ def _cube_response(self) -> Dict:
@lazyproperty
def _is_single_filter_col_cube(self) -> float:
"""bool determines if it is a single column filter cube."""
return self._cube_dict["result"].get("is_single_col_cube", False)
return self._cube_response["result"].get("is_single_col_cube", False)

@lazyproperty
def _measures(self) -> "_Measures":
Expand All @@ -602,7 +597,7 @@ def _measures(self) -> "_Measures":
Provides access to count based measures and numeric measures (e.g. mean, sum)
when available.
"""
return _Measures(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
return _Measures(self._cube_response, self._all_dimensions, self._cube_idx_arg)

@lazyproperty
def _numeric_measure_references(self) -> Dict:
Expand Down
10 changes: 6 additions & 4 deletions src/cr/cube/cubepart.py
Original file line number Diff line number Diff line change
Expand Up @@ -1736,10 +1736,12 @@ def _assemble_vector(self, base_vector, subtotals, order, diffs_nan=False):
# values from a _BaseSubtotals subclass.
vector_subtotals = np.array(
[
np.nan
if diffs_nan and len(subtotal.subtrahend_idxs) > 0
else np.sum(base_vector[subtotal.addend_idxs])
- np.sum(base_vector[subtotal.subtrahend_idxs])
(
np.nan
if diffs_nan and len(subtotal.subtrahend_idxs) > 0
else np.sum(base_vector[subtotal.addend_idxs])
- np.sum(base_vector[subtotal.subtrahend_idxs])
)
for subtotal in subtotals
]
)
Expand Down
84 changes: 40 additions & 44 deletions src/cr/cube/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import copy
from collections.abc import Sequence
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union
from functools import partial
from typing import Callable, Dict, List, Optional, Tuple, Union

import numpy as np

Expand All @@ -17,6 +17,38 @@
)
from cr.cube.util import lazyproperty

from .util import format, format_datetime

# --- Datetime format string for each resolution code. `_formatter()` looks the code
# --- up from the "resolution" entry of a datetime typedef's subtype and hands the
# --- matching string to `format_datetime` as `orig_format`. A code that is not
# --- listed here has no format.
DATETIME_FORMATS = {
    # --- year only ---
    "Y": "%Y",
    # --- these three codes all share the year-month format ---
    "Q": "%Y-%m",
    "3M": "%Y-%m",
    "M": "%Y-%m",
    # --- these two codes share the full-date format ---
    "W": "%Y-%m-%d",
    "D": "%Y-%m-%d",
    # --- date plus a "T"-separated time of increasing precision ---
    "h": "%Y-%m-%dT%H",
    "m": "%Y-%m-%dT%H:%M",
    "s": "%Y-%m-%dT%H:%M:%S",
    # --- both sub-second codes use the same fractional-seconds directive ---
    "ms": "%Y-%m-%dT%H:%M:%S.%f",
    "us": "%Y-%m-%dT%H:%M:%S.%f",
}


def _formatter(dimension_type, typedef, out_format) -> Union[Callable, partial]:
    """Return the callable used to format element values of a dimension.

    The plain `format` helper is returned for every non-datetime dimension. It is
    also returned for a datetime dimension when either the resolution found in
    `typedef["subtype"]` has no entry in `DATETIME_FORMATS` or `out_format` is
    falsy. Otherwise the result is `format_datetime` with `orig_format` and
    `out_format` pre-bound via `functools.partial`.

    `typedef` is only inspected for datetime dimensions, where it must contain a
    "subtype" mapping.
    """
    # --- only datetime dimensions can get a datetime-aware formatter ---
    if dimension_type != DT.DATETIME:
        return format

    # --- unknown (or missing) resolution collapses to "", i.e. "no source format" ---
    source_format: str = (
        DATETIME_FORMATS.get(typedef["subtype"].get("resolution")) or ""
    )

    # --- both the source and the target format are required to convert ---
    if not (source_format and out_format):
        return format

    return partial(format_datetime, orig_format=source_format, out_format=out_format)


class Dimensions(tuple):
"""Collection containing every dimension defined in cube response."""
Expand Down Expand Up @@ -460,20 +492,6 @@ class Elements(tuple):
Each element is either a category or a subvariable.
"""

datetime_formats = {
"Y": "%Y",
"Q": "%Y-%m",
"3M": "%Y-%m",
"M": "%Y-%m",
"W": "%Y-%m-%d",
"D": "%Y-%m-%d",
"h": "%Y-%m-%dT%H",
"m": "%Y-%m-%dT%H:%M",
"s": "%Y-%m-%dT%H:%M:%S",
"ms": "%Y-%m-%dT%H:%M:%S.%f",
"us": "%Y-%m-%dT%H:%M:%S.%f",
}

@classmethod
def from_typedef(
cls, typedef, dimension_transforms_dict, dimension_type, element_data_format
Expand Down Expand Up @@ -509,31 +527,7 @@ def from_typedef(
xforms = _ElementTransforms(
all_xforms.get(element_id, all_xforms.get(str(element_id), {}))
)

# This is so dumb that our type checker won't let us just
# write `formatter = str`.
def format(x) -> str:
return str(x)

formatter = format

if dimension_type == DT.DATETIME:
orig_format = cls.datetime_formats.get(
typedef["subtype"].get("resolution")
)
out_format = element_data_format
if orig_format is not None and out_format is not None:

def format_datetime(x) -> str:
try:
return datetime.strptime(x, orig_format).strftime(
out_format
)
except ValueError:
return str(x)

formatter = format_datetime

formatter = _formatter(dimension_type, typedef, element_data_format)
element = Element(element_dict, idx, xforms, formatter)
elements.append(element)

Expand Down Expand Up @@ -842,9 +836,11 @@ def _replaced_element_transforms(self, element_transforms) -> Dict:
if key == "subvar_id":
# --- translate from subvariable id
new_keys = tuple(
self._subvar_aliases[self._subvar_ids.index(_id)]
if _id in self._subvar_ids
else None
(
self._subvar_aliases[self._subvar_ids.index(_id)]
if _id in self._subvar_ids
else None
)
for _id in old_keys
)
else:
Expand Down
27 changes: 18 additions & 9 deletions src/cr/cube/matrix/assembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,24 @@ def row_display_order(cls, dimensions, second_order_measures, format):
HelperCls = (
_SortRowsByBaseColumnHelper
if collation_method == CM.OPPOSING_ELEMENT
else _SortRowsByDerivedColumnHelper
if collation_method == CM.OPPOSING_INSERTION and dim_type in DT.ARRAY_TYPES
else _SortRowsByInsertedColumnHelper
if collation_method == CM.OPPOSING_INSERTION
else _SortRowsByLabelHelper
if collation_method == CM.LABEL
else _SortRowsByMarginalHelper
if collation_method == CM.MARGINAL
else _RowOrderHelper
else (
_SortRowsByDerivedColumnHelper
if collation_method == CM.OPPOSING_INSERTION
and dim_type in DT.ARRAY_TYPES
else (
_SortRowsByInsertedColumnHelper
if collation_method == CM.OPPOSING_INSERTION
else (
_SortRowsByLabelHelper
if collation_method == CM.LABEL
else (
_SortRowsByMarginalHelper
if collation_method == CM.MARGINAL
else _RowOrderHelper
)
)
)
)
)
return HelperCls(dimensions, second_order_measures, format)._display_order

Expand Down
64 changes: 39 additions & 25 deletions src/cr/cube/matrix/cubemeasure.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,11 @@ def factory(cls, counts, diff_nans, cube, dimensions, slice_idx):
Chooses between unweighted and weighted counts based on `type`.
"""
dimension_type_strings = tuple(
"MR"
if dim_type == DT.MR
else "ARR"
if dim_type in DT.ARRAY_TYPES
else "CAT"
(
"MR"
if dim_type == DT.MR
else "ARR" if dim_type in DT.ARRAY_TYPES else "CAT"
)
for dim_type in cube.dimension_types[-2:]
)
CubeCountsCls = {
Expand Down Expand Up @@ -812,11 +812,15 @@ def factory(cls, cube, dimensions, slice_idx):
CubeMeansCls = (
_MrXMrCubeMeans
if dimension_types == (DT.MR, DT.MR)
else _MrXCatCubeMeans
if dimension_types[0] == DT.MR
else _CatXMrCubeMeans
if dimension_types[1] == DT.MR
else _CatXCatCubeMeans
else (
_MrXCatCubeMeans
if dimension_types[0] == DT.MR
else (
_CatXMrCubeMeans
if dimension_types[1] == DT.MR
else _CatXCatCubeMeans
)
)
)
return CubeMeansCls(
dimensions, cube.means[cls._slice_idx_expr(cube, slice_idx)]
Expand Down Expand Up @@ -1027,11 +1031,15 @@ def factory(cls, cube, dimensions, slice_idx):
CubeSumsCls = (
_MrXMrCubeStdDev
if dimension_types == (DT.MR, DT.MR)
else _MrXCatCubeStdDev
if dimension_types[0] == DT.MR
else _CatXMrCubeStdDev
if dimension_types[1] == DT.MR
else _CatXCatCubeStdDev
else (
_MrXCatCubeStdDev
if dimension_types[0] == DT.MR
else (
_CatXMrCubeStdDev
if dimension_types[1] == DT.MR
else _CatXCatCubeStdDev
)
)
)
return CubeSumsCls(
dimensions, cube.stddev[cls._slice_idx_expr(cube, slice_idx)]
Expand Down Expand Up @@ -1110,11 +1118,13 @@ def factory(cls, cube, dimensions, slice_idx):
CubeSumsCls = (
_MrXMrCubeSums
if dimension_types == (DT.MR, DT.MR)
else _MrXCatCubeSums
if dimension_types[0] == DT.MR
else _CatXMrCubeSums
if dimension_types[1] == DT.MR
else _CatXCatCubeSums
else (
_MrXCatCubeSums
if dimension_types[0] == DT.MR
else (
_CatXMrCubeSums if dimension_types[1] == DT.MR else _CatXCatCubeSums
)
)
)
return CubeSumsCls(dimensions, cube.sums[cls._slice_idx_expr(cube, slice_idx)])

Expand Down Expand Up @@ -1202,11 +1212,15 @@ def factory(cls, cube, dimensions, slice_idx):
UnconditionalCubeCountsCls = (
_MrXMrUnconditionalCubeCounts
if dimension_types == (DT.MR, DT.MR)
else _MrXCatUnconditionalCubeCounts
if dimension_types[0] == DT.MR
else _CatXMrUnconditionalCubeCounts
if dimension_types[1] == DT.MR
else _CatXCatUnconditionalCubeCounts
else (
_MrXCatUnconditionalCubeCounts
if dimension_types[0] == DT.MR
else (
_CatXMrUnconditionalCubeCounts
if dimension_types[1] == DT.MR
else _CatXCatUnconditionalCubeCounts
)
)
)
return UnconditionalCubeCountsCls(
dimensions,
Expand Down
Loading

0 comments on commit 990ec99

Please sign in to comment.