Merge pull request #394 from Crunch-io/memory-leakage-deepcopy-186966946

remove deepcopy from cube response
Crunch-io · Feb 4, 2024 · 990ec99
2 parents 049c0e3 + 18d28f3
commit 990ec99
Show file tree

Hide file tree

Showing 15 changed files with 169 additions and 185 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
       - id: trailing-whitespace
 
   - repo: https://github.com/ambv/black
-    rev: 22.6.0
+    rev: 24.1.1
     hooks:
       - id: black
 

diff --git a/src/cr/cube/cube.py b/src/cr/cube/cube.py
@@ -5,7 +5,6 @@
 CubeSet is the main API class for manipulating Crunch.io JSON cube responses.
 """
 
-import copy
 import json
 from typing import Dict, FrozenSet, Iterator, List, Optional, Tuple, Union
 
@@ -260,11 +259,15 @@ def counts_with_missings(self) -> np.ndarray:
         return (
             self._measures.weighted_valid_counts.raw_cube_array
             if self._measures.weighted_valid_counts is not None
-            else self._measures.unweighted_valid_counts.raw_cube_array
-            if self._measures.unweighted_valid_counts is not None
-            else self._measures.weighted_counts.raw_cube_array
-            if self.has_weighted_counts
-            else self._measures.unweighted_counts.raw_cube_array
+            else (
+                self._measures.unweighted_valid_counts.raw_cube_array
+                if self._measures.unweighted_valid_counts is not None
+                else (
+                    self._measures.weighted_counts.raw_cube_array
+                    if self.has_weighted_counts
+                    else self._measures.unweighted_counts.raw_cube_array
+                )
+            )
         )
 
     @lazyproperty
@@ -312,8 +315,10 @@ def inflate(self) -> "Cube":
         A multi-cube (tabbook) response formed from a function (e.g. mean()) on
         a numeric variable arrives without a rows-dimension.
         """
-        cube_dict = self._cube_dict
-        dimensions = cube_dict["result"]["dimensions"]
+        cube_dict = self._cube_response
+        num_array_dim = self._numeric_array_dimension or None
+        dims = cube_dict["result"]["dimensions"]
+        dimensions = [num_array_dim] + dims if num_array_dim else dims
         default_name = "-".join([m.value for m in self._available_numeric_measures])
         # --- The default value in case of numeric variable is the combination of all
         # --- the measures expressed in the cube response.
@@ -435,7 +440,7 @@ def title(self) -> str:
         use-case it is a stand-in for the columns-dimension name since a strand has no
         columns dimension.
         """
-        return self._cube_dict["result"].get("title", "Untitled")
+        return self._cube_response["result"].get("title", "Untitled")
 
     @lazyproperty
     def unweighted_counts(self) -> np.ndarray:
@@ -533,9 +538,12 @@ def weighted_squared_counts(self) -> Optional[np.ndarray]:
         ].astype(np.float64)
 
     @lazyproperty
-    def _all_dimensions(self) -> list:
+    def _all_dimensions(self) -> Dimensions:
         """List of all dimensions (not just user-apparent ones) for this cube."""
-        return Dimensions.from_dicts(self._cube_dict["result"]["dimensions"])
+        num_array_dim = self._numeric_array_dimension or None
+        dims = self._cube_response["result"]["dimensions"]
+        dimension_dicts = [num_array_dim] + dims if num_array_dim else dims
+        return Dimensions.from_dicts(dimension_dicts)
 
     @lazyproperty
     def _available_numeric_measures(self) -> Tuple[CUBE_MEASURE, ...]:
@@ -560,19 +568,6 @@ def _ca_as_0th(self) -> bool:
             and self.dimension_types[0] == DT.CA
         )
 
-    @lazyproperty
-    def _cube_dict(self) -> Dict:
-        """dict containing raw cube response, parsed from JSON payload."""
-        cube_dict = copy.deepcopy(self._cube_response)
-        if self._numeric_measure_subvariables:
-            dimensions = cube_dict.get("result", {}).get("dimensions", [])
-            # ---dim inflation---
-            # ---In case of numeric arrays, we need to inflate the row dimension
-            # ---according to the mean subvariables. For each subvar the row dimension
-            # ---will have a new element related to the subvar metadata.
-            dimensions.insert(0, self._numeric_array_dimension)
-        return cube_dict
-
     @lazyproperty
     def _cube_response(self) -> Dict:
         """dict representing the parsed cube response arguments."""
@@ -593,7 +588,7 @@ def _cube_response(self) -> Dict:
     @lazyproperty
     def _is_single_filter_col_cube(self) -> float:
         """bool determines if it is a single column filter cube."""
-        return self._cube_dict["result"].get("is_single_col_cube", False)
+        return self._cube_response["result"].get("is_single_col_cube", False)
 
     @lazyproperty
     def _measures(self) -> "_Measures":
@@ -602,7 +597,7 @@ def _measures(self) -> "_Measures":
         Provides access to count based measures and numeric measures (e.g. mean, sum)
         when available.
         """
-        return _Measures(self._cube_dict, self._all_dimensions, self._cube_idx_arg)
+        return _Measures(self._cube_response, self._all_dimensions, self._cube_idx_arg)
 
     @lazyproperty
     def _numeric_measure_references(self) -> Dict:

diff --git a/src/cr/cube/cubepart.py b/src/cr/cube/cubepart.py
@@ -1736,10 +1736,12 @@ def _assemble_vector(self, base_vector, subtotals, order, diffs_nan=False):
         # values from a _BaseSubtotals subclass.
         vector_subtotals = np.array(
             [
-                np.nan
-                if diffs_nan and len(subtotal.subtrahend_idxs) > 0
-                else np.sum(base_vector[subtotal.addend_idxs])
-                - np.sum(base_vector[subtotal.subtrahend_idxs])
+                (
+                    np.nan
+                    if diffs_nan and len(subtotal.subtrahend_idxs) > 0
+                    else np.sum(base_vector[subtotal.addend_idxs])
+                    - np.sum(base_vector[subtotal.subtrahend_idxs])
+                )
                 for subtotal in subtotals
             ]
         )

diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py
@@ -4,8 +4,8 @@
 
 import copy
 from collections.abc import Sequence
-from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Union
+from functools import partial
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -17,6 +17,38 @@
 )
 from cr.cube.util import lazyproperty
 
+from .util import format, format_datetime
+
+DATETIME_FORMATS = {
+    "Y": "%Y",
+    "Q": "%Y-%m",
+    "3M": "%Y-%m",
+    "M": "%Y-%m",
+    "W": "%Y-%m-%d",
+    "D": "%Y-%m-%d",
+    "h": "%Y-%m-%dT%H",
+    "m": "%Y-%m-%dT%H:%M",
+    "s": "%Y-%m-%dT%H:%M:%S",
+    "ms": "%Y-%m-%dT%H:%M:%S.%f",
+    "us": "%Y-%m-%dT%H:%M:%S.%f",
+}
+
+
+def _formatter(dimension_type, typedef, out_format) -> Union[Callable, partial]:
+    """Returns a formatting function according to the dimension type."""
+
+    if dimension_type != DT.DATETIME:
+        formatter: Union[Callable, partial] = format
+    else:
+        resolution = typedef["subtype"].get("resolution")
+        orig_format: str = DATETIME_FORMATS.get(resolution) or ""
+        formatter = (
+            partial(format_datetime, orig_format=orig_format, out_format=out_format)
+            if orig_format and out_format
+            else format
+        )
+    return formatter
+
 
 class Dimensions(tuple):
     """Collection containing every dimension defined in cube response."""
@@ -460,20 +492,6 @@ class Elements(tuple):
     Each element is either a category or a subvariable.
     """
 
-    datetime_formats = {
-        "Y": "%Y",
-        "Q": "%Y-%m",
-        "3M": "%Y-%m",
-        "M": "%Y-%m",
-        "W": "%Y-%m-%d",
-        "D": "%Y-%m-%d",
-        "h": "%Y-%m-%dT%H",
-        "m": "%Y-%m-%dT%H:%M",
-        "s": "%Y-%m-%dT%H:%M:%S",
-        "ms": "%Y-%m-%dT%H:%M:%S.%f",
-        "us": "%Y-%m-%dT%H:%M:%S.%f",
-    }
-
     @classmethod
     def from_typedef(
         cls, typedef, dimension_transforms_dict, dimension_type, element_data_format
@@ -509,31 +527,7 @@ def from_typedef(
             xforms = _ElementTransforms(
                 all_xforms.get(element_id, all_xforms.get(str(element_id), {}))
             )
-
-            # This is so dumb that our type checker won't let us just
-            # write `formatter = str`.
-            def format(x) -> str:
-                return str(x)
-
-            formatter = format
-
-            if dimension_type == DT.DATETIME:
-                orig_format = cls.datetime_formats.get(
-                    typedef["subtype"].get("resolution")
-                )
-                out_format = element_data_format
-                if orig_format is not None and out_format is not None:
-
-                    def format_datetime(x) -> str:
-                        try:
-                            return datetime.strptime(x, orig_format).strftime(
-                                out_format
-                            )
-                        except ValueError:
-                            return str(x)
-
-                    formatter = format_datetime
-
+            formatter = _formatter(dimension_type, typedef, element_data_format)
             element = Element(element_dict, idx, xforms, formatter)
             elements.append(element)
 
@@ -842,9 +836,11 @@ def _replaced_element_transforms(self, element_transforms) -> Dict:
         if key == "subvar_id":
             # --- translate from subvariable id
             new_keys = tuple(
-                self._subvar_aliases[self._subvar_ids.index(_id)]
-                if _id in self._subvar_ids
-                else None
+                (
+                    self._subvar_aliases[self._subvar_ids.index(_id)]
+                    if _id in self._subvar_ids
+                    else None
+                )
                 for _id in old_keys
             )
         else:

diff --git a/src/cr/cube/matrix/assembler.py b/src/cr/cube/matrix/assembler.py
@@ -64,15 +64,24 @@ def row_display_order(cls, dimensions, second_order_measures, format):
         HelperCls = (
             _SortRowsByBaseColumnHelper
             if collation_method == CM.OPPOSING_ELEMENT
-            else _SortRowsByDerivedColumnHelper
-            if collation_method == CM.OPPOSING_INSERTION and dim_type in DT.ARRAY_TYPES
-            else _SortRowsByInsertedColumnHelper
-            if collation_method == CM.OPPOSING_INSERTION
-            else _SortRowsByLabelHelper
-            if collation_method == CM.LABEL
-            else _SortRowsByMarginalHelper
-            if collation_method == CM.MARGINAL
-            else _RowOrderHelper
+            else (
+                _SortRowsByDerivedColumnHelper
+                if collation_method == CM.OPPOSING_INSERTION
+                and dim_type in DT.ARRAY_TYPES
+                else (
+                    _SortRowsByInsertedColumnHelper
+                    if collation_method == CM.OPPOSING_INSERTION
+                    else (
+                        _SortRowsByLabelHelper
+                        if collation_method == CM.LABEL
+                        else (
+                            _SortRowsByMarginalHelper
+                            if collation_method == CM.MARGINAL
+                            else _RowOrderHelper
+                        )
+                    )
+                )
+            )
         )
         return HelperCls(dimensions, second_order_measures, format)._display_order
 

diff --git a/src/cr/cube/matrix/cubemeasure.py b/src/cr/cube/matrix/cubemeasure.py
@@ -131,11 +131,11 @@ def factory(cls, counts, diff_nans, cube, dimensions, slice_idx):
         Chooses between unweighted and weighted counts based on `type`.
         """
         dimension_type_strings = tuple(
-            "MR"
-            if dim_type == DT.MR
-            else "ARR"
-            if dim_type in DT.ARRAY_TYPES
-            else "CAT"
+            (
+                "MR"
+                if dim_type == DT.MR
+                else "ARR" if dim_type in DT.ARRAY_TYPES else "CAT"
+            )
             for dim_type in cube.dimension_types[-2:]
         )
         CubeCountsCls = {
@@ -812,11 +812,15 @@ def factory(cls, cube, dimensions, slice_idx):
         CubeMeansCls = (
             _MrXMrCubeMeans
             if dimension_types == (DT.MR, DT.MR)
-            else _MrXCatCubeMeans
-            if dimension_types[0] == DT.MR
-            else _CatXMrCubeMeans
-            if dimension_types[1] == DT.MR
-            else _CatXCatCubeMeans
+            else (
+                _MrXCatCubeMeans
+                if dimension_types[0] == DT.MR
+                else (
+                    _CatXMrCubeMeans
+                    if dimension_types[1] == DT.MR
+                    else _CatXCatCubeMeans
+                )
+            )
         )
         return CubeMeansCls(
             dimensions, cube.means[cls._slice_idx_expr(cube, slice_idx)]
@@ -1027,11 +1031,15 @@ def factory(cls, cube, dimensions, slice_idx):
         CubeSumsCls = (
             _MrXMrCubeStdDev
             if dimension_types == (DT.MR, DT.MR)
-            else _MrXCatCubeStdDev
-            if dimension_types[0] == DT.MR
-            else _CatXMrCubeStdDev
-            if dimension_types[1] == DT.MR
-            else _CatXCatCubeStdDev
+            else (
+                _MrXCatCubeStdDev
+                if dimension_types[0] == DT.MR
+                else (
+                    _CatXMrCubeStdDev
+                    if dimension_types[1] == DT.MR
+                    else _CatXCatCubeStdDev
+                )
+            )
         )
         return CubeSumsCls(
             dimensions, cube.stddev[cls._slice_idx_expr(cube, slice_idx)]
@@ -1110,11 +1118,13 @@ def factory(cls, cube, dimensions, slice_idx):
         CubeSumsCls = (
             _MrXMrCubeSums
             if dimension_types == (DT.MR, DT.MR)
-            else _MrXCatCubeSums
-            if dimension_types[0] == DT.MR
-            else _CatXMrCubeSums
-            if dimension_types[1] == DT.MR
-            else _CatXCatCubeSums
+            else (
+                _MrXCatCubeSums
+                if dimension_types[0] == DT.MR
+                else (
+                    _CatXMrCubeSums if dimension_types[1] == DT.MR else _CatXCatCubeSums
+                )
+            )
         )
         return CubeSumsCls(dimensions, cube.sums[cls._slice_idx_expr(cube, slice_idx)])
 
@@ -1202,11 +1212,15 @@ def factory(cls, cube, dimensions, slice_idx):
         UnconditionalCubeCountsCls = (
             _MrXMrUnconditionalCubeCounts
             if dimension_types == (DT.MR, DT.MR)
-            else _MrXCatUnconditionalCubeCounts
-            if dimension_types[0] == DT.MR
-            else _CatXMrUnconditionalCubeCounts
-            if dimension_types[1] == DT.MR
-            else _CatXCatUnconditionalCubeCounts
+            else (
+                _MrXCatUnconditionalCubeCounts
+                if dimension_types[0] == DT.MR
+                else (
+                    _CatXMrUnconditionalCubeCounts
+                    if dimension_types[1] == DT.MR
+                    else _CatXCatUnconditionalCubeCounts
+                )
+            )
         )
         return UnconditionalCubeCountsCls(
             dimensions,