Merge pull request #12 from artefactory/new_sig

New Signature
artefactory · Jan 15, 2024 · 12a602e · 12a602e
2 parents fb4e1ba + dc6fdf3
commit 12a602e
Show file tree

Hide file tree

Showing 13 changed files with 2,136 additions and 912 deletions.
diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py
diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py
@@ -0,0 +1,292 @@
+"""Different classes to optimize RAM usage with repeated features over time."""
+from abc import ABC, abstractmethod
+
+import numpy as np
+import pandas as pd
+
+from choice_learn.data.indexer import OneHotStorageIndexer, StorageIndexer
+
+
+class Storage(ABC):
+    """Common abstract parent of OneHotStorage and FeaturesStorage."""
+
+    def __init__(self, features_to_store):
+        """Keep a reference to the object to store.
+
+        Parameters
+        ----------
+        features_to_store : object
+            Object to store, kept as-is in the features_to_store attribute
+        """
+        self.features_to_store = features_to_store
+
+    @abstractmethod
+    def __getitem__(self, keys):
+        """Access stored element(s). To be implemented in children classes.
+
+        Parameters
+        ----------
+        keys : float, int, str or list of
+            values among the indexes of the storage
+        """
+
+    @abstractmethod
+    def __len__(self):
+        """Return the length of the sequence of apparition of the features."""
+
+    @property
+    def batch(self):
+        """Indexing attribute, this base implementation gives None."""
+        return None
+
+
+class FeaturesStorage(Storage):
+    """Store features vectors, each of them being identified by an id.
+
+    The features are kept in the storage attribute, a dict mapping each id to a np.ndarray.
+    Batches of features are retrieved through the indexer exposed by .batch.
+    """
+
+    def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=StorageIndexer):
+        """Build the store.
+
+        Parameters
+        ----------
+        ids : array_like or None
+            list of ids of features to store. If None is given, ids are created from
+            apparition order of values. Ignored when values is a dict.
+        values : dict, pd.DataFrame, list or np.ndarray
+            features to store. A dict maps each id to a 1D list or np.ndarray, a DataFrame
+            holds one row per id (its "id" column, if any, is used as index).
+        values_names : array_like
+            Iterable of str indicating the name of the features. Must be same length as values.
+            Ignored when values is a DataFrame (its columns are used instead).
+        name: string, optional
+            name of the features store
+        indexer : callable, optional
+            called with the storage itself to build the object returned by .batch,
+            default is StorageIndexer
+
+        Raises
+        ------
+        ValueError
+            if values has an unsupported type, is empty, or is a dict whose values are
+            not 1D lists/arrays sharing the same length
+        """
+        if isinstance(values, dict):
+            # Build a new dict: the dict given by the caller must not be modified in place.
+            storage = {}
+            lengths = set()
+            for k, v in values.items():
+                if not isinstance(v, (np.ndarray, list)):
+                    raise ValueError("values must be a dict of np.ndarray or list")
+                if isinstance(v, list):
+                    v = np.array(v)
+                if v.ndim != 1:
+                    raise ValueError(
+                        "values (features) must be a dict of np.ndarray or list of 1D arrays"
+                    )
+                lengths.add(len(v))
+                storage[k] = v
+            if len(lengths) != 1:
+                raise ValueError("values (dict values) must all have same length")
+            if ids is not None:
+                print("Warning: ids is ignored when values is a dict")
+
+        elif isinstance(values, pd.DataFrame):
+            if values_names is not None:
+                print("Warning: values_names is ignored when values is a DataFrame")
+            if "id" in values.columns:
+                values = values.set_index("id")
+            values_names = values.columns
+            storage = {k: v.to_numpy() for (k, v) in values.iterrows()}
+        elif isinstance(values, (list, np.ndarray)):
+            if ids is None:
+                ids = list(range(len(values)))
+            storage = {k: np.array(v) for (k, v) in zip(ids, values)}
+        else:
+            raise ValueError("values must be a dict, a DataFrame, a list or a numpy array")
+
+        if not storage:
+            # Without this check the shape computation below leaks a bare StopIteration.
+            raise ValueError("values must contain at least one element")
+
+        self.storage = storage
+        self.values_names = values_names
+        self.name = name
+        # Kept so that subsets created by __getitem__ use the same indexer.
+        self._indexer_factory = indexer
+
+        self.shape = (len(self), len(next(iter(self.storage.values()))))
+        self.indexer = indexer(self)
+
+    def get_element_from_index(self, index):
+        """Getter method over the store, by position.
+
+        Returns the features stored at position index. Compared to __getitem__, it does
+        not take ids but the position(s) of the element(s) in the store.
+
+        Parameters
+        ----------
+        index : (int, list, slice)
+            position(s) of the feature(s) in the store
+
+        Returns:
+        --------
+        array_like
+            result of self.batch[keys], keys being the ids found at the given position(s)
+        """
+        # Materialize the keys once instead of once per requested position.
+        all_keys = list(self.storage)
+        if isinstance(index, slice):
+            keys = all_keys[index]
+        else:
+            if isinstance(index, (int, np.integer)):
+                index = [index]
+            keys = [all_keys[i] for i in index]
+        return self.batch[keys]
+
+    def __len__(self):
+        """Returns the number of ids in the store."""
+        return len(self.storage)
+
+    def __getitem__(self, id_keys):
+        """Subset FeaturesStorage, keeping only features which id is in keys.
+
+        Parameters
+        ----------
+        id_keys : Iterable
+            List (or np.ndarray) of ids to keep. Any other object is considered as a single id.
+
+        Returns:
+        --------
+        FeaturesStorage
+            Subset of the FeaturesStorage, with only the features whose id is in id_keys
+
+        Raises
+        ------
+        KeyError
+            if one of the ids is not in the store
+        """
+        if isinstance(id_keys, np.ndarray):
+            # An array cannot be an id (unhashable): it is a collection of ids.
+            id_keys = list(id_keys)
+        elif not isinstance(id_keys, list):
+            id_keys = [id_keys]
+        sub_storage = {k: self.storage[k] for k in id_keys}
+        return FeaturesStorage(
+            values=sub_storage,
+            values_names=self.values_names,
+            name=self.name,
+            indexer=self._indexer_factory,
+        )
+
+    def get_storage_type(self):
+        """Functions to access stored elements dtypes.
+
+        Returns:
+        --------
+        dtype
+            dtype attribute of the batch obtained for the first element of the store
+        """
+        element = self.get_element_from_index(0)
+        return element.dtype
+
+    @property
+    def batch(self):
+        """Indexing attribute, gives the indexer built at instantiation."""
+        return self.indexer
+
+
+class OneHotStorage(Storage):
+    """Specific Storage for one hot features storage.
+
+    Inherits from Storage.
+    For example can be used to store a OneHot representation of the days of week.
+
+    Has the same attributes as FeaturesStorage, only differs with some One-Hot optimized methods.
+    It only stores the indexes of the features, and creates the OneHot matrix
+    when needed, using .batch[].
+    """
+
+    def __init__(
+        self, ids=None, values=None, name=None, dtype=np.uint8, indexer=OneHotStorageIndexer
+    ):
+        """Build the store.
+
+        Parameters
+        ----------
+        ids : array_like or None
+            list of ids of features to store. If None is given, ids are created from
+            apparition order of values. Ignored when values is a dict.
+        values : dict, list, np.ndarray or None
+            index of the one of the one hot vector for each id. A dict maps each id to
+            its index. If None, indexes are attributed following the order of ids.
+        name: string, optional
+            name of the features store
+        dtype: type
+            type for One Hot representation, usually int or float, default is np.uint8
+        indexer : callable, optional
+            called with the storage itself to build the object returned by .batch,
+            default is OneHotStorageIndexer
+
+        Raises
+        ------
+        ValueError
+            if values has an unsupported type, if a dict value is not an integer or
+            if both ids and values are None
+        """
+        if isinstance(values, dict):
+            for v in values.values():
+                # numpy integers (e.g. read from an array) are valid indexes too
+                if not isinstance(v, (int, np.integer)):
+                    raise ValueError(
+                        """values of values dict must be int as
+                        they are indexes of the one hot vector ones."""
+                    )
+            # Same normalization to Python int as for list/array values.
+            storage = {k: int(v) for (k, v) in values.items()}
+            length = int(np.max(list(storage.values()))) + 1
+            if ids is not None:
+                print("Warning: ids is ignored when values is a dict")
+
+        elif isinstance(values, (list, np.ndarray)):
+            if ids is None:
+                ids = list(range(len(values)))
+            storage = {k: int(v) for (k, v) in zip(ids, values)}
+            length = int(np.max(values)) + 1
+
+        elif values is None:
+            if ids is None:
+                raise ValueError("ids or values must be given, both are None")
+            ids = list(ids)
+            storage = {id_key: position for (position, id_key) in enumerate(ids)}
+            length = len(ids)
+        else:
+            raise ValueError("values must be a dict, a list, a numpy array or None")
+
+        self.storage = storage
+        self.name = name
+        # Kept so that subsets created by __getitem__ use the same indexer.
+        self._indexer_factory = indexer
+
+        self.shape = (len(self), length)
+        self.dtype = dtype
+        self.indexer = indexer(self)
+
+    def __len__(self):
+        """Returns the number of ids in the store."""
+        return len(self.storage)
+
+    def __getitem__(self, id_keys):
+        """Subset OneHotStorage, keeping only features which id is in keys.
+
+        Parameters
+        ----------
+        id_keys : Iterable
+            List of ids to keep. An int or a str is considered as a single id.
+
+        Returns:
+        --------
+        OneHotStorage
+            Subset of the OneHotStorage, with only the features whose id is in id_keys
+
+        Raises
+        ------
+        KeyError
+            if one of the ids is not in the store
+        """
+        # A str is a single id, it must not be iterated character by character.
+        if isinstance(id_keys, (int, np.integer, str, bytes)):
+            id_keys = [id_keys]
+        sub_storage = {k: self.storage[k] for k in id_keys}
+
+        # NOTE(review): the one hot length of the subset is computed again from the kept
+        # indexes, it can be smaller than the one of this storage - confirm it is intended.
+        return OneHotStorage(
+            values=sub_storage, name=self.name, dtype=self.dtype, indexer=self._indexer_factory
+        )
+
+    def astype(self, dtype):
+        """Method to change (mainly int or float) type of returned OneHot features vectors.
+
+        Only the dtype attribute is updated, nothing is returned.
+
+        Parameters
+        ----------
+        dtype : type
+            Type to set the features as
+        """
+        self.dtype = dtype
+
+    def get_element_from_index(self, index):
+        """Getter method over the store, by position.
+
+        Returns the index(es) stored at position index. Compared to __getitem__, it does
+        not take ids but the position(s) of the element(s) in the store.
+
+        Parameters
+        ----------
+        index : (int, list, slice)
+            position(s) of the feature(s) in the store
+
+        Returns:
+        --------
+        int or list of int
+            stored one hot index for an int position, list of the stored one hot indexes
+            for a list or a slice of positions
+        """
+        all_keys = list(self.storage)
+        if isinstance(index, (int, np.integer)):
+            return self.storage[all_keys[index]]
+        if isinstance(index, slice):
+            keys = all_keys[index]
+        else:
+            keys = [all_keys[i] for i in index]
+        return [self.storage[k] for k in keys]
+
+    def get_storage_type(self):
+        """Functions to access stored elements dtypes.
+
+        Returns:
+        --------
+        type
+            dtype attribute of the storage, as given at instantiation or through astype
+        """
+        return self.dtype
+
+    @property
+    def batch(self):
+        """Indexing attribute, gives the indexer built at instantiation."""
+        return self.indexer
diff --git a/choice_learn/models/base_model.py b/choice_learn/models/base_model.py
@@ -472,13 +472,15 @@ def load_model(cls, path):
         # Load optimizer step
         return cls
 
-    def predict_probas(self, choice_dataset):
+    def predict_probas(self, choice_dataset, batch_size=-1):
         """Predicts the choice probabilities for each session and each product of a ChoiceDataset.
 
         Parameters
         ----------
         choice_dataset : ChoiceDataset
             Dataset on which to apply to prediction
+        batch_size : int, optional
+            Batch size to use for the prediction, by default -1
 
         Returns:
         --------
@@ -492,7 +494,7 @@ def predict_probas(self, choice_dataset):
             sessions_items_batch,
             availabilities_batch,
             choices_batch,
-        ) in choice_dataset.iter_batch():
+        ) in choice_dataset.iter_batch(batch_size=batch_size):
             _, probabilities = self.batch_predict(
                 items_batch,
                 sessions_batch,
@@ -504,7 +506,7 @@ def predict_probas(self, choice_dataset):
 
         return tf.concat(stacked_probabilities, axis=0)
 
-    def evaluate(self, choice_dataset, batch_size=None):
+    def evaluate(self, choice_dataset, batch_size=-1):
         """Evaluates the model for each session and each product of a ChoiceDataset.
 
         Predicts the probabilities according to the model and computes the Negative-Log-Likelihood
@@ -520,8 +522,6 @@ def evaluate(self, choice_dataset, batch_size=None):
         np.ndarray (n_sessions, n_items)
             Choice probabilties for each session and each product
         """
-        if batch_size is None:
-            batch_size = choice_dataset.batch_size
         batch_losses = []
         for (
             items_batch,