From 5b957c8e9f14cfcaa6f881eb90010099f1cafa78 Mon Sep 17 00:00:00 2001
From: Emmanuel MALHERBE
Date: Fri, 5 Jan 2024 09:58:17 +0100
Subject: [PATCH 01/22] ADD: completely WIP proposal, for a different ChoiceDataset signature (core init)

---
 choice_learn/data/choice_dataset.py | 287 ++++++++++------------------
 1 file changed, 99 insertions(+), 188 deletions(-)

diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
index fdacbabc..815d1c07 100644
--- a/choice_learn/data/choice_dataset.py
+++ b/choice_learn/data/choice_dataset.py
@@ -24,15 +24,12 @@ class ChoiceDataset(object):
 
     def __init__(
         self,
-        items_features=None,
-        sessions_features=None,
-        sessions_items_features=None,
-        items_features_names=None,
-        sessions_features_names=None,
-        sessions_items_features_names=None,
-        sessions_items_availabilities=None,
-        choices=None, # Should not have None as default value ?
-        batch_size=16,
+        choices, # Should not have None as default value ?
+        available_items_features, # MUST INCLUDE item_id column; possible: column "available" binary
+        contexts_features=None, # as many context as choices. values or ids (look at key)
+        features_by_ids=None, # list of FeaturesById --> requires to have df with col "_id"
+        #choices_features_names=None, # optional way to provide keys
+        #items_features_names=None, # optional way to provide keys
         shuffle=False,
     ):
         """Builds the ChoiceDataset.
@@ -42,86 +39,64 @@ def __init__(
         Parameters
         ----------
         items_features : tuple of (array_like, )
            matrix of shape (num_items, num_items_features) containing the features of the items
            e.g. item color
-        sessions_features : tuple of (array_like, )
-            matrix of shape (num_sessions, num_sess_features) containing the features of the
-            sessions e.g. day of week
-        sessions_items_features : tuple of (array_like, )
-            matrix of shape (num_sessions, num_items, num_ses_items_features) containing the item
-            features varying over sessions, e.g. prices
-        sessions_items_availabilities : array_like
-            binary matrix of shape (num_sessions, num_items) containing the availabitilies of
-            products (1. if present 0. otherwise) over sessions
         choices: list of list
-            for each sessions we have a list of related choices. Main list has same legnth as
-            session_features and sessions_items_features.
-        batch_size: int, optional
-            size of the batches to return in __iter__ method
+            for each context we have a list of related choices. Main list has same length as
+            contexts_features.
suffle: bool, optional whether to shuffle the dataset or not """ + + if choices is None: + # Done to keep a logical order of arguments, and has logic: choices have to be specified + raise ValueError("Choices must be specified, got None") + + assert len(choices) == len(available_items) + assert contexts_features is None or len(choices) == len(contexts_features) + # --------- [ Handling features type given as tuples or not ] --------- # # If items_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names - if not isinstance(items_features, tuple) and items_features is not None: - items_features = (items_features,) - items_features_names = (items_features_names,) - self._return_items_features_tuple = False - # items_features is already a tuple, names are given, checking consistency - elif items_features is not None and items_features_names is not None: - if ( - len(items_features) != len(items_features_names) - and items_features_names is not None - ): - raise ValueError("items_features shape and items_features_names shape do not match") - self._return_items_features_tuple = True - # In this case names are missing, still transform it as a tuple - elif items_features is not None: - self._return_items_features_tuple = True - items_features_names = (None,) * len(items_features) - - # If sessions_features is not given as tuple, transform it internally as a tuple - # A bit longer because can be None and need to also handle names - if not isinstance(sessions_features, tuple) and sessions_features is not None: - sessions_features = (sessions_features,) - sessions_features_names = (sessions_features_names,) - self._return_sessions_features_tuple = False - # sessions_features is already a tuple, names are given, checking consistency - elif sessions_features is not None and sessions_features_names is not None: - if ( - len(sessions_features) != len(sessions_features_names) - and sessions_features_names is not None - ): - raise ValueError( - "sessions_features shape and sessions_features_names shape \ - do not match" - ) - self._return_sessions_features_tuple = True - # In this case names are missing, still transform it as a tuple - elif sessions_features is not None: - self._return_sessions_features_tuple = True - sessions_features_names = (None,) * len(sessions_features) + items_features = None # to pass this part + if items_features is not None: + if not isinstance(items_features, tuple): + items_features = (items_features,) + items_features_names = (items_features_names,) + self._return_items_features_tuple = False + # items_features is already a tuple, names are given, checking consistency + elif items_features_names is not None: + if ( + len(items_features) != len(items_features_names) + and items_features_names is not None + ): + raise ValueError("items_features shape and items_features_names shape do not match") + self._return_items_features_tuple = True + # In this case names are missing, still transform it as a tuple + else: + self._return_items_features_tuple = True + items_features_names = (None,) * len(items_features) - # If sessions_items_features is not given as tuple, transform it internally as a tuple + # If choices_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names - if not isinstance(sessions_items_features, tuple) and sessions_items_features is not None: - sessions_items_features = (sessions_items_features,) - 
sessions_items_features_names = (sessions_items_features_names,) - self._return_sessions_items_features_tuple = False - # sessions_items_features is already a tuple, names are given, checking consistency - elif sessions_items_features is not None and sessions_items_features_names is not None: - if ( - len(sessions_items_features) != len(sessions_items_features_names) - and sessions_items_features_names is not None - ): - raise ValueError( - "sessions_items_features shape and \ - sessions_items_features_names shape do not match" - ) - self._return_sessions_items_features_tuple = True - # In this case names are missing, still transform it as a tuple - elif sessions_items_features is not None: - self._return_sessions_items_features_tuple = True - sessions_items_features_names = (None,) * len(sessions_items_features) + if contexts_features is not None: + if not isinstance(contexts_features, tuple): + contexts_features = (contexts_features,) + #choices_features_names = (choices_features_names,) + self._return_choices_features_tuple = False + # choices_features is already a tuple, names are given, checking consistency + elif choices_features_names is not None: + if ( + len(contexts_features) != len(choices_features_names) + and choices_features_names is not None + ): + raise ValueError( + "choices_features shape and choices_features_names shape \ + do not match" + ) + self._return_choices_features_tuple = True + # In this case names are missing, still transform it as a tuple + else: + self._return_choices_features_tuple = True + choices_features_names = (None,) * len(contexts_features) # --------- [Normalizing features types (DataFrame, List, etc...) -> np.ndarray] --------- # # @@ -151,71 +126,37 @@ def __init__( items_features[:i] + (np.array(feature),) + items_features[i + 1 :] ) - # Handling sessions_features - if sessions_features is not None: - for i, feature in enumerate(sessions_features): + # Handling choices_features + if contexts_features is not None: + for i, feature in enumerate(contexts_features): if isinstance(feature, pd.DataFrame): - # Ordering sessions by id ? + # Ordering choices by id ? if "session_id" in feature.columns: feature = feature.set_index("session_id") - sessions_features = ( - sessions_features[:i] + contexts_features = ( + contexts_features[:i] + (feature.loc[np.sort(feature.index)].to_numpy(),) - + sessions_features[i + 1 :] - ) - sessions_features_names = ( - sessions_features_names[:i] - + (feature.columns.tolist(),) - + sessions_features_names[i + 1 :] + + contexts_features[i + 1 :] ) + #choices_features_names = ( + # choices_features_names[:i] + # + (feature.columns.tolist(),) + # + choices_features_names[i + 1 :] + #) elif isinstance(feature, list): - sessions_features = ( - sessions_features[:i] + (np.array(feature),) + sessions_features[i + 1 :] + contexts_features = ( + contexts_features[:i] + (np.array(feature),) + contexts_features[i + 1 :] ) - # Handling sessions_items_features - if sessions_items_features is not None: - for i, feature in enumerate(sessions_items_features): - if isinstance(feature, pd.DataFrame): - # Ordering sessions and items by id ? 
- if "session_id" not in feature.columns: - feature["session_id"] = feature.index - items_index = np.sort(feature.item_id.unique()) - sessions_index = np.sort(feature.session_id.unique()) - names = [f for f in feature.columns if f != "session_id" and f != "item_id"] - - ( - feature, - sessions_items_availabilities, - ) = self._sessions_items_features_df_to_np( - feature, items_index, sessions_index, feature.columns.tolist() - ) - - sessions_items_features = ( - sessions_items_features[:i] + feature + sessions_items_features[i + 1 :] - ) - - sessions_items_features_names = ( - sessions_items_features_names[:i] - + (names,) - + sessions_items_features_names[i + 1 :] - ) - elif isinstance(feature, list): - sessions_items_features = ( - sessions_items_features[:i] - + (np.array(feature),) - + sessions_items_features[i + 1 :] - ) - - if isinstance(sessions_items_availabilities, list): - sessions_items_availabilities = np.array(sessions_items_availabilities) + if isinstance(available_items, list): + available_items = np.array(available_items, dtype=object) # Handling choices # Choices must then be given as the name of the chosen item # Items are sorted by name and attributed an index # Cannot be a list of choices yet if isinstance(choices, pd.DataFrame): - # Ordering sessions by id + # Ordering choices by id if "session_id" in choices.columns: choices = choices.set_index("session_id") choices = choices.loc[np.sort(choices.index)] @@ -224,29 +165,22 @@ def __init__( choices = [np.where(items == c)[0] for c in choices.choice] # Setting attributes of ChoiceDataset - self.items_features = items_features - self.sessions_features = sessions_features - self.sessions_items_features = sessions_items_features - self.sessions_items_availabilities = sessions_items_availabilities + self.contexts_features = contexts_features + self.available_items = available_items - self.items_features_names = items_features_names - self.sessions_features_names = sessions_features_names - self.sessions_items_features_names = sessions_items_features_names + #self.items_features_names = items_features_names + #self.choices_features_names = choices_features_names - self.batch_size = batch_size self.shuffle = shuffle - if choices is None: - # Done to keep a logical order of arguments, and has logic: choices have to be specified - raise ValueError("Choices must be specified, got None") self.ragged_choices = choices self.indexes, self.choices = self._build_indexes(choices) self.n_choices = len(self.choices) # Different consitency checks to ensure everythin is coherent - self._check_dataset() # Should handle alone if np.arrays are squeezed - self._return_types = self._check_types() - self._check_names() + #self._check_dataset() # Should handle alone if np.arrays are squeezed + #self._return_types = self._check_types() + #self._check_names() # Build .iloc method self.indexer = ChoiceDatasetIndexer(self) @@ -292,7 +226,7 @@ def _check_dataset(self): Particularly: - Over number of items - - Over number of sessions + - Over number of choices Verifies that the choices have coherent values """ self._check_num_items_shapes() @@ -465,14 +399,14 @@ def _check_names(self): ) def __len__(self): - """Returns length of the dataset e.g. total number of sessions. + """Returns length of the dataset e.g. total number of choices. Returns: ------- int - total number of sessions + total number of choices """ - return self.base_num_sessions + return len(self.choices) def get_num_items(self): """Method to access the total number of different items. 
@@ -484,28 +418,18 @@ def get_num_items(self):
         """
         return self.base_num_items
 
-    def get_num_sessions(self):
-        """Method to access the total number of different sessions.
+    def get_num_choices(self):
+        """Method to access the total number of different choices.
 
         Redundant with __len__ method.
 
         Returns:
         -------
         int
-            total number of different sessions
+            total number of different choices
         """
         return len(self)
 
-    def get_num_choices(self):
-        """Method to access the total number of different sessions.
-
-        Returns:
-        -------
-        int
-            total number of different sessions
-        """
-        return self.n_choices
-
     @classmethod
     def _sessions_items_features_df_to_np(
         cls,
@@ -725,17 +649,14 @@ def summary(self):
         else:
             print("No sessions items features registered")
 
-    def get_choice_batch(self, choice_index):
+    def get_choices_batch(self, choices_indexes, features=None):
         """Method to access data within the ListChoiceDataset from its index.
 
         One index corresponds to a choice within a session.
         Return order:
-        - Fixed item features
-        - Session features
-        - Session item features
-        - Items availabilities
-        - Choice
+        - df_chosen_item, df of length batch_size
+        - dfs_available_items, list of length batch_size of dfs of length n_available_items
 
         Parameters
         ----------
@@ -838,16 +759,10 @@ def get_choice_batch(self, choice_index):
         else:
             sessions_items_availabilities = self.sessions_items_availabilities[session_index]
 
-        return (
-            items_features,
-            sessions_features,
-            sessions_items_features,
-            sessions_items_availabilities,
-            choice,
-        )
+        return df_chosen_item, dfs_available_items
 
-    def __getitem__(self, session_indexes):
-        """Method to create a sub-ChoiceDataset with only a subset of sessions, from their indexes.
+    def __getitem__(self, choices_indexes):
+        """Method to create a sub-ChoiceDataset with only a subset of choices, from their indexes.
 
         Parameters
         ----------
@@ -876,14 +791,13 @@ def __getitem__(self, session_indexes):
             ),
             sessions_items_availabilities=self.sessions_items_availabilities[session_indexes],
             choices=[self.ragged_choices[i] for i in session_indexes],
-            batch_size=self.batch_size,
             items_features_names=self.items_features_names,
             sessions_features_names=self.sessions_features_names,
             sessions_items_features_names=self.sessions_items_features_names,
         )
 
-    def old_batch(self, batch_size=None, shuffle=None, sample_weight=None):
-        """Iterates over dataset return batches of length self.batch_size.
+    def old_batch(self, batch_size, shuffle=None, sample_weight=None):
+        """Iterates over the dataset, returning batches of length batch_size.
 
         Parameters
         ----------
@@ -894,8 +808,7 @@ def old_batch(self, batch_size=None, shuffle=None, sample_weight=None):
         sample_weight : Iterable
             list of weights to be returned with the right indexing during the shuffling
         """
-        if batch_size is None:
-            batch_size = self.batch_size
+
         if shuffle is None:
             shuffle = self.shuffle
         if batch_size == -1:
@@ -933,8 +846,8 @@ def batch(self):
         """Indexer."""
         return self.indexer
 
-    def iter_batch(self, batch_size=None, shuffle=None, sample_weight=None):
-        """Iterates over dataset return batches of length self.batch_size.
+    def iter_batch(self, batch_size, shuffle=None, sample_weight=None):
+        """Iterates over the dataset, returning batches of length batch_size.
 
         Newer version.
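With `batch_size` no longer defaulting to `self.batch_size`, both `old_batch` and `iter_batch`
now require an explicit value. A hedged usage sketch (it assumes an already-constructed
`dataset` instance; `32` is an arbitrary illustrative size):

    for batch in dataset.iter_batch(batch_size=32, shuffle=True):
        ...  # process one batch of 32 choices

    # batch_size == -1 is special-cased in the hunks above,
    # presumably yielding the whole dataset as a single batch.
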
@@ -947,8 +860,7 @@ def iter_batch(self, batch_size=None, shuffle=None, sample_weight=None):
         sample_weight : Iterable
             list of weights to be returned with the right indexing during the shuffling
         """
-        if batch_size is None:
-            batch_size = self.batch_size
+
         if shuffle is None:
             shuffle = self.shuffle
         if batch_size == -1:
@@ -986,6 +898,5 @@ def filter(self, bool_list):
         list of booleans of length self.get_num_sessions() to filter sessions.
             True to keep, False to discard.
         """
-        indexes = list(range(len(bool_list)))
-        indexes = [i for i, keep in zip(indexes, bool_list) if keep]
+        indexes = [i for i, keep in enumerate(bool_list) if keep]
         return self[indexes]

From 31c7435a92e88552121a3a4820e2581a88de592e Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Tue, 9 Jan 2024 18:33:00 +0100
Subject: [PATCH 02/22] intermediate new signature

---
 choice_learn/data/choice_dataset.py | 129 +++++++++++++++++++---------
 choice_learn/data/storage.py        |  74 ++++++++++++++++
 2 files changed, 162 insertions(+), 41 deletions(-)
 create mode 100644 choice_learn/data/storage.py

diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
index 815d1c07..a8599bff 100644
--- a/choice_learn/data/choice_dataset.py
+++ b/choice_learn/data/choice_dataset.py
@@ -25,12 +25,14 @@ class ChoiceDataset(object):
     def __init__(
         self,
         choices, # Should not have None as default value ?
-        available_items_features, # MUST INCLUDE item_id column; possible: column "available" binary
-        contexts_features=None, # as many context as choices. values or ids (look at key)
-        features_by_ids=None, # list of FeaturesById --> requires to have df with col "_id"
-        #choices_features_names=None, # optional way to provide keys
-        #items_features_names=None, # optional way to provide keys
-        shuffle=False,
+        fixed_items_features=None,
+        contexts_features=None, # as many context as choices. values or ids (look at key)
+        contexts_items_features=None, # MUST INCLUDE item_id column; possible: column "available" binary
+        contexts_items_availabilities=None,
+        features_by_ids=None, # list of (name, FeaturesStorage) --> requires to have df with col "_id"
+        fixed_items_features_names=None,
+        contexts_features_names=None,
+        contexts_items_features_names=None,
     ):
         """Builds the ChoiceDataset.
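A sketch of how the intermediate signature above is meant to be called. The toy arrays follow
the shape conventions of the docstrings ((n_items, ...), (n_contexts, ...),
(n_contexts, n_items, ...)); every value is illustrative and the API is still WIP here:

    import numpy as np
    from choice_learn.data.choice_dataset import ChoiceDataset

    dataset = ChoiceDataset(
        choices=[0, 1],  # one chosen item index per context
        fixed_items_features=np.array([[1.0], [2.0]]),  # (n_items, n_features)
        contexts_features=np.array([[0.5], [0.7]]),  # (n_contexts, n_features)
        contexts_items_features=np.array(
            [[[1.0], [2.0]], [[1.5], [2.5]]]  # (n_contexts, n_items, n_features)
        ),
        contexts_items_availabilities=np.ones((2, 2)),  # all items available everywhere
    )
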
@@ -50,13 +52,11 @@ def __init__( # Done to keep a logical order of arguments, and has logic: choices have to be specified raise ValueError("Choices must be specified, got None") - assert len(choices) == len(available_items) - assert contexts_features is None or len(choices) == len(contexts_features) - # --------- [ Handling features type given as tuples or not ] --------- # # If items_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names - items_features = None # to pass this part + + items_features = None # to pass this part if items_features is not None: if not isinstance(items_features, tuple): items_features = (items_features,) @@ -68,7 +68,9 @@ def __init__( len(items_features) != len(items_features_names) and items_features_names is not None ): - raise ValueError("items_features shape and items_features_names shape do not match") + raise ValueError( + "items_features shape and items_features_names shape do not match" + ) self._return_items_features_tuple = True # In this case names are missing, still transform it as a tuple else: @@ -80,7 +82,7 @@ def __init__( if contexts_features is not None: if not isinstance(contexts_features, tuple): contexts_features = (contexts_features,) - #choices_features_names = (choices_features_names,) + # choices_features_names = (choices_features_names,) self._return_choices_features_tuple = False # choices_features is already a tuple, names are given, checking consistency elif choices_features_names is not None: @@ -105,28 +107,30 @@ def __init__( # names as features names # Handling items_features - if items_features is not None: - for i, feature in enumerate(items_features): + if fixed_items_features is not None: + for i, feature in enumerate(fixed_items_features): if isinstance(feature, pd.DataFrame): # Ordering items by id ? if "item_id" in feature.columns: feature = feature.set_index("item_id") - items_features = ( - items_features[:i] + fixed_items_features = ( + fixed_items_features[:i] + (feature.loc[np.sort(feature.index)].to_numpy(),) - + items_features[i + 1 :] + + fixed_items_features[i + 1 :] ) - items_features_names = ( - items_features_names[:i] + fixed_items_features_names = ( + fixed_items_features_names[:i] + (feature.columns.tolist(),) - + items_features_names[i + 1 :] + + fixed_items_features_names[i + 1 :] ) elif isinstance(feature, list): - items_features = ( - items_features[:i] + (np.array(feature),) + items_features[i + 1 :] + fixed_items_features = ( + fixed_items_features[:i] + + (np.array(feature),) + + fixed_items_features[i + 1 :] ) - # Handling choices_features + # Handling context features if contexts_features is not None: for i, feature in enumerate(contexts_features): if isinstance(feature, pd.DataFrame): @@ -138,18 +142,61 @@ def __init__( + (feature.loc[np.sort(feature.index)].to_numpy(),) + contexts_features[i + 1 :] ) - #choices_features_names = ( - # choices_features_names[:i] - # + (feature.columns.tolist(),) - # + choices_features_names[i + 1 :] - #) elif isinstance(feature, list): contexts_features = ( contexts_features[:i] + (np.array(feature),) + contexts_features[i + 1 :] ) - - if isinstance(available_items, list): - available_items = np.array(available_items, dtype=object) + # Handling contexts_items_features + if contexts_items_features is not None: + for i, feature in enumerate(contexts_items_features): + if isinstance(feature, pd.DataFrame): + # Ordering choices by id ? 
+ if "session_id" in feature.columns: + if "item_id" in feature.columns: + feature_array = [] + for sess in np.sort(feature.session_id): + sess_df = feature.loc[feature.session_id == sess] + sess_df = sess_df.set_index("item_id") + feature_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy()) + contexts_items_features = ( + contexts_items_features[:i] + + (np.stack(feature_array, axis=0),) + + contexts_items_features[i + 1 :] + ) + else: + feature = feature.set_index("session_id") + contexts_items_features = ( + contexts_items_features[:i] + + (feature.loc[np.sort(feature.index)].to_numpy(),) + + contexts_items_features[i + 1 :] + ) + elif isinstance(feature, list): + contexts_items_features = ( + contexts_items_features[:i] + + (np.array(feature),) + + contexts_items_features[i + 1 :] + ) + if contexts_items_availabilities is not None: + if isinstance(contexts_items_availabilities, list): + contexts_items_availabilities = np.array( + contexts_items_availabilities, dtype=object + ) + elif isinstance(contexts_items_availabilities, pd.DataFrame): + if "session_id" in contexts_items_availabilities.columns: + if "item_id" in contexts_items_availabilities.columns: + av_array = [] + for sess in np.sort(contexts_items_availabilities.session_id): + sess_df = contexts_items_availabilities.loc[ + contexts_items_availabilities.session_id == sess + ] + sess_df = sess_df.set_index("item_id") + av_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy()) + contexts_items_availabilities = np.array(av_array) + else: + feature = feature.set_index("session_id") + contexts_items_availabilities = contexts_items_availabilities.loc[ + np.sort(feature.index) + ].to_numpy() # Handling choices # Choices must then be given as the name of the chosen item @@ -165,22 +212,22 @@ def __init__( choices = [np.where(items == c)[0] for c in choices.choice] # Setting attributes of ChoiceDataset + self.fixedf_items_features = fixed_items_features self.contexts_features = contexts_features - self.available_items = available_items - - #self.items_features_names = items_features_names - #self.choices_features_names = choices_features_names + self.contexts_items_features = contexts_items_features + self.context_items_availabilities = contexts_items_availabilities + self.choices = choices - self.shuffle = shuffle + self.fixed_items_features_names = fixed_items_features_names + self.contexts_features_names = contexts_features_names + self.contexts_items_features_names = contexts_items_features_names - self.ragged_choices = choices - self.indexes, self.choices = self._build_indexes(choices) self.n_choices = len(self.choices) # Different consitency checks to ensure everythin is coherent - #self._check_dataset() # Should handle alone if np.arrays are squeezed - #self._return_types = self._check_types() - #self._check_names() + # self._check_dataset() # Should handle alone if np.arrays are squeezed + # self._return_types = self._check_types() + # self._check_names() # Build .iloc method self.indexer = ChoiceDatasetIndexer(self) diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py new file mode 100644 index 00000000..e966e369 --- /dev/null +++ b/choice_learn/data/storage.py @@ -0,0 +1,74 @@ +"""Different classes to optimize RAM usage with repeated features over time.""" +import numpy as np +import panda as pd + + +class Storage(object): + """Class to keep OneHotStore and FeaturesStore with same parent.""" + + def __init__(self, ids=None, values=None, values_names=None, name=None): + """Builds the store. 
+ + Parameters + ---------- + indexes : array_like or None + list of indexes of features to store. If None is given, indexes are created from + apparition order of values + values : array_like + list of values of features to store + sequence : array_like + sequence of apparitions of the features + name: string, optional + name of the features store -- not used at the moment + """ + if isinstance(values, dict): + pass + elif isinstance(values, pd.DataFrame): + if values_names is not None: + print("Warning: values_names is ignored when values is a DataFrame") + if "id" in values.columns: + values = values.set_index("id") + values_names = values.columns + storage = {k: v.to_numpy() for (k, v) in values.iterrows()} + elif isinstance(values, list): + pass + elif isinstance(values, np.ndarray): + pass + else: + raise ValueError("values must be a dict, a DataFrame, a list or a numpy array") + + if ids is None: + ids = list(range(len(values))) + + self.storage = {k: v for (k, v) in zip(ids, values)} + self.name = name + + def _get_store_element(self, index): + """Getter method over self.sequence. + + Returns the features stored at index index. Compared to __getitem__, it does take + the index-th element of sequence but the index-th element of the store. + + Parameters + ---------- + index : (int, list, slice) + index argument of the feature + + Returns: + -------- + array_like + features corresponding to the index index in self.store + """ + if isinstance(index, list): + return [self.store[i] for i in index] + # else: + return self.store[index] + + def __len__(self): + """Returns the length of the sequence of apparition of the features.""" + return len(self.sequence) + + @property + def batch(self): + """Indexing attribute.""" + return self.indexer From 2eb992cc3b45f9ba27819f16f8a0e1abfcb5d9fa Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 9 Jan 2024 21:42:03 +0100 Subject: [PATCH 03/22] ADD: new FeaturesStorage --- choice_learn/data/indexer.py | 33 ++++++++++++++++++ choice_learn/data/storage.py | 67 +++++++++++++++++++++++++++++------- 2 files changed, 87 insertions(+), 13 deletions(-) diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py index 5974f59f..38f05254 100644 --- a/choice_learn/data/indexer.py +++ b/choice_learn/data/indexer.py @@ -1,5 +1,6 @@ """Indexer classes for data classes.""" from abc import abstractmethod +from collections.abc import Iterable import numpy as np @@ -64,6 +65,38 @@ def __getitem__(self, sequence_index): ] return self.store.store[self.store.sequence[sequence_index]] +class StorageIndexer(Indexer): + """Class for Ilocing/Batching FeaturesStorage.""" + + def __init__(self, storage): + """StoreIndexer constructor. + + Parameters + ---------- + store : choice_modeling.data.store.FeaturesStore + Store object to be indexed. + """ + self.storage = storage + + def __getitem__(self, sequence_keys): + """Returns the features appearing at the sequence_index-th position of sequence. 
+ + Parameters + ---------- + sequence_index : (int, list, slice) + index position of the sequence + + Returns: + -------- + array_like + features corresponding to the sequence_index-th position of sequence + """ + if isinstance(sequence_keys, Iterable): + return np.array([self.storage.storage[key] for key in sequence_keys]) + if isinstance(sequence_keys, slice): + raise ValueError("Slicing is not supported for storage") + return np.array(self.storage.storage[sequence_keys]) + class OneHotStoreIndexer(Indexer): """Class for Ilocing OneHotStore.""" diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py index e966e369..3d2e2b60 100644 --- a/choice_learn/data/storage.py +++ b/choice_learn/data/storage.py @@ -1,12 +1,34 @@ """Different classes to optimize RAM usage with repeated features over time.""" +from abc import ABC, abstractmethod + import numpy as np -import panda as pd +import pandas as pd +from choice_learn.data.indexer import StorageIndexer -class Storage(object): +class Storage(ABC): """Class to keep OneHotStore and FeaturesStore with same parent.""" - def __init__(self, ids=None, values=None, values_names=None, name=None): + def __init__(self, features_to_store): + self.features_to_store = features_to_store + + @abstractmethod + def __getitem__(self, keys): + pass + + @abstractmethod + def __len__(self): + pass + + @property + def batch(self): + pass + + +class FeaturesStorage(Storage): + """Class to keep OneHotStore and FeaturesStore with same parent.""" + + def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=StorageIndexer): """Builds the store. Parameters @@ -22,7 +44,14 @@ def __init__(self, ids=None, values=None, values_names=None, name=None): name of the features store -- not used at the moment """ if isinstance(values, dict): - pass + storage = values + lengths = [] + for k, v in storage.items(): + assert (isinstance(v, np.ndarray) | isinstance(v, list)) + assert len(np.array(v).shape) == 1 + lengths.append(len(v)) + assert len(set(lengths)) == 1 + elif isinstance(values, pd.DataFrame): if values_names is not None: print("Warning: values_names is ignored when values is a DataFrame") @@ -30,19 +59,20 @@ def __init__(self, ids=None, values=None, values_names=None, name=None): values = values.set_index("id") values_names = values.columns storage = {k: v.to_numpy() for (k, v) in values.iterrows()} - elif isinstance(values, list): - pass - elif isinstance(values, np.ndarray): - pass + elif isinstance(values, list) or isinstance(values, np.ndarray): + if ids is None: + ids = list(range(len(values))) + storage = {k: v for (k, v) in zip(ids, values)} else: raise ValueError("values must be a dict, a DataFrame, a list or a numpy array") - if ids is None: - ids = list(range(len(values))) - - self.storage = {k: v for (k, v) in zip(ids, values)} + self.storage = storage + self.values_names = values_names self.name = name + self.shape = (len(self), len(next(iter(self.storage.values())))) + self.indexer = indexer(self) + def _get_store_element(self, index): """Getter method over self.sequence. @@ -66,7 +96,18 @@ def _get_store_element(self, index): def __len__(self): """Returns the length of the sequence of apparition of the features.""" - return len(self.sequence) + return len(self.storage) + + def __getitem__(self, keys): + """_summary_. 
+ + Parameters + ---------- + keys : _type_ + _description_ + """ + sub_storage = {k: self.storage[k] for k in keys} + return FeaturesStorage(values=sub_storage, values_names=self.values_names, name=self.name) @property def batch(self): From b6289adcd0c52df66e5b256b9314ea837c9d6df1 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 9 Jan 2024 21:42:45 +0100 Subject: [PATCH 04/22] ADD: FeaturesStorage Example --- choice_learn/data/indexer.py | 1 + choice_learn/data/storage.py | 3 +- notebooks/features_storage_example.ipynb | 139 +++++++++++++++++++++++ 3 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 notebooks/features_storage_example.ipynb diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py index 38f05254..2c8568b1 100644 --- a/choice_learn/data/indexer.py +++ b/choice_learn/data/indexer.py @@ -65,6 +65,7 @@ def __getitem__(self, sequence_index): ] return self.store.store[self.store.sequence[sequence_index]] + class StorageIndexer(Indexer): """Class for Ilocing/Batching FeaturesStorage.""" diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py index 3d2e2b60..a96167aa 100644 --- a/choice_learn/data/storage.py +++ b/choice_learn/data/storage.py @@ -6,6 +6,7 @@ from choice_learn.data.indexer import StorageIndexer + class Storage(ABC): """Class to keep OneHotStore and FeaturesStore with same parent.""" @@ -47,7 +48,7 @@ def __init__(self, ids=None, values=None, values_names=None, name=None, indexer= storage = values lengths = [] for k, v in storage.items(): - assert (isinstance(v, np.ndarray) | isinstance(v, list)) + assert isinstance(v, np.ndarray) | isinstance(v, list) assert len(np.array(v).shape) == 1 lengths.append(len(v)) assert len(set(lengths)) == 1 diff --git a/notebooks/features_storage_example.ipynb b/notebooks/features_storage_example.ipynb new file mode 100644 index 00000000..4ed33770 --- /dev/null +++ b/notebooks/features_storage_example.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Remove GPU use\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n", + "\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.append(\"../\")\n", + "\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from choice_learn.data.storage import FeaturesStorage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = {\"customerA\": [1, 2, 3], \"customerB\": [4, 5, 6], \"customerC\": [7, 8, 9]}\n", + "\n", + "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "storage[[\"customerA\", \"customerC\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n", + "ids = [\"customerA\", \"customerB\", \"customerC\"]\n", + "\n", + "storage = FeaturesStorage(ids=ids, values=features, values_names=[\"age\", \"income\", \"children_nb\"], 
name=\"customers\")\n", + "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]\n", + "\n", + "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers\")\n", + "storage.batch[[0, 2, 0, 2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = {\"age\": [1, 2, 3], \"income\": [4, 5, 6], \"children_nb\": [7, 8, 9], \"id\": [\"customerA\", \"customerB\", \"customerC\"]}\n", + "features = pd.DataFrame(features)\n", + "storage = FeaturesStorage(values=features, name=\"customers\")\n", + "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = {\"age\": [1, 2, 3], \"income\": [4, 5, 6], \"children_nb\": [7, 8, 9]}\n", + "features = pd.DataFrame(features, index=[\"customerA\", \"customerB\", \"customerC\"])\n", + "storage = FeaturesStorage(values=features, name=\"customers\")\n", + "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tf_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e4b07270a7fd102092c33fe2ef86b2d5f7e34a13 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 9 Jan 2024 23:15:31 +0100 Subject: [PATCH 05/22] ADD: basic CD init running --- choice_learn/data/choice_dataset.py | 175 ++++++++++++++++------------ 1 file changed, 103 insertions(+), 72 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index a8599bff..cbf57729 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -29,7 +29,7 @@ def __init__( contexts_features=None, # as many context as choices. 
values or ids (look at key) contexts_items_features=None, # MUST INCLUDE item_id column; possible: column "available" binary contexts_items_availabilities=None, - features_by_ids=None, # list of (name, FeaturesStorage) --> requires to have df with col "_id" + features_by_ids=[], # list of (name, FeaturesStorage) --> requires to have df with col "_id" fixed_items_features_names=None, contexts_features_names=None, contexts_items_features_names=None, @@ -56,49 +56,77 @@ def __init__( # If items_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names - items_features = None # to pass this part - if items_features is not None: - if not isinstance(items_features, tuple): - items_features = (items_features,) - items_features_names = (items_features_names,) + if fixed_items_features is not None: + if not isinstance(fixed_items_features, tuple): + if fixed_items_features_names is not None: + assert len(fixed_items_features) == len(fixed_items_features_names), "Number of features given does not match number of features names given." self._return_items_features_tuple = False - # items_features is already a tuple, names are given, checking consistency - elif items_features_names is not None: - if ( - len(items_features) != len(items_features_names) - and items_features_names is not None - ): - raise ValueError( - "items_features shape and items_features_names shape do not match" - ) - self._return_items_features_tuple = True - # In this case names are missing, still transform it as a tuple + + fixed_items_features = (fixed_items_features,) + fixed_items_features_names = (fixed_items_features_names,) else: self._return_items_features_tuple = True - items_features_names = (None,) * len(items_features) + + # items_features is already a tuple, names are given, checking consistency + if fixed_items_features_names is not None: + for f, name in zip(fixed_items_features, fixed_items_features_names): + if ( + len(f) != len(name) + ): + raise ValueError( + "items_features shape and items_features_names shape do not match" + ) + # In this case names are missing, still transform it as a tuple + else: + fixed_items_features_names = (None,) * len(fixed_items_features) # If choices_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names if contexts_features is not None: if not isinstance(contexts_features, tuple): - contexts_features = (contexts_features,) - # choices_features_names = (choices_features_names,) self._return_choices_features_tuple = False + if contexts_items_features_names is not None: + assert len(contexts_features) == len(contexts_features_names), "Number of features given does not match number of features names given." 
+ + contexts_features_names = (contexts_features_names,) + contexts_features = (contexts_features,) + # choices_features is already a tuple, names are given, checking consistency - elif choices_features_names is not None: - if ( - len(contexts_features) != len(choices_features_names) - and choices_features_names is not None - ): + else: + self._return_contexts_features_tuple = True + if contexts_features_names is not None: + for f, name in zip(contexts_features, contexts_features_names): + if ( + len(f) != len(name) + ): + raise ValueError( + "contexts_features shape and contexts_features_names shape do not match" + ) + + # In this case names are missing, still transform it as a tuple + else: + choices_features_names = (None,) * len(contexts_features) + + if not isinstance(contexts_items_features, tuple) and contexts_items_features is not None: + self._return_sessions_items_features_tuple = False + if contexts_items_features_names is not None: + assert len(contexts_items_features) == len(contexts_items_features_names), "Number of features given does not match number of features names given for contexts_items." + contexts_items_features = (contexts_items_features,) + contexts_items_features_names = (contexts_items_features_names,) + + # sessions_items_features is already a tuple, names are given, checking consistency + elif contexts_items_features is not None and contexts_items_features_names is not None: + for f, name in zip(contexts_items_features, contexts_items_features_names): + if len(f) != len(name): raise ValueError( - "choices_features shape and choices_features_names shape \ - do not match" + "contexts_items_features shape and \ + contexts_items_features_names shape do not match" ) - self._return_choices_features_tuple = True - # In this case names are missing, still transform it as a tuple - else: - self._return_choices_features_tuple = True - choices_features_names = (None,) * len(contexts_features) + self._return_sessions_items_features_tuple = True + # In this case names are missing, still transform it as a tuple + elif contexts_items_features is not None: + self._return_sessions_items_features_tuple = True + contexts_items_features_names = (None,) * len(contexts_items_features) # --------- [Normalizing features types (DataFrame, List, etc...) 
-> np.ndarray] --------- # # @@ -201,30 +229,33 @@ def __init__( # Handling choices # Choices must then be given as the name of the chosen item # Items are sorted by name and attributed an index - # Cannot be a list of choices yet if isinstance(choices, pd.DataFrame): # Ordering choices by id - if "session_id" in choices.columns: - choices = choices.set_index("session_id") + if "context_id" in choices.columns: + choices = choices.set_index("context_id") choices = choices.loc[np.sort(choices.index)] items = np.sort(np.unique(choices.choice)) # items is the value (str) of the item choices = [np.where(items == c)[0] for c in choices.choice] # Setting attributes of ChoiceDataset - self.fixedf_items_features = fixed_items_features + self.fixed_items_features = fixed_items_features self.contexts_features = contexts_features self.contexts_items_features = contexts_items_features self.context_items_availabilities = contexts_items_availabilities self.choices = choices + self.features_by_ids = features_by_ids + self.fixed_items_features_names = fixed_items_features_names self.contexts_features_names = contexts_features_names self.contexts_items_features_names = contexts_items_features_names + self._build_features_by_ids() + self.n_choices = len(self.choices) - # Different consitency checks to ensure everythin is coherent + # Different consitency checks to ensure everything is coherent # self._check_dataset() # Should handle alone if np.arrays are squeezed # self._return_types = self._check_types() # self._check_names() @@ -232,41 +263,41 @@ def __init__( # Build .iloc method self.indexer = ChoiceDatasetIndexer(self) - def _build_indexes(self, choices): - """Builds the indexes dictionnary from the choices. - - Particularly creates a flatten version of the choices and associates an index so that we can - retrieve from this index the session and the corresponding choice. 
- - Parameters: - ----------- - choices: list of list - raffed version of the choices - - Returns:: - -------- - indexes: dict - dictionnary of indexes: {index: corresponding_session_index} - choices: np.ndarray - flattened (1D) version of the choices - """ - try: # 1 choice by session - if len(np.squeeze(choices).shape) == 1: - indexes = {i: i for i in range(len(choices))} - flat_choices = np.squeeze(self.ragged_choices) - elif len(np.squeeze(choices).shape) == 0: - indexes = {i: i for i in range(len(choices))} - flat_choices = np.array([np.squeeze(self.ragged_choices)]) - except ValueError: # Ragged sequence of choices - indexes = {} - flat_choices = [] - total_count = 0 - for sess_nb, sess in enumerate(choices): - for choice in sess: - indexes[total_count] = sess_nb - flat_choices.append(choice) - total_count += 1 - return indexes, np.array(flat_choices) + def _build_features_by_ids(self): + if len(self.features_by_ids) == 0: + print("No features_by_ids given.") + return + + if self.fixed_items_features_names is None and self.contexts_features_names is None and self.contexts_items_features_names is None: + raise ValueError("No features_names given, match with fiven features_by_ids impossible.") + + fixed_items_features_map = [] + contexts_features_map = [] + contexts_items_features_map = [] + + if self.fixed_items_features_names is not None: + for i, feature in self.fixed_items_features_names: + for j, feature_by_id in enumerate(self.features_by_ids): + if feature == feature_by_id.name: + fixed_items_features_map.append((i, feature_by_id.batch)) + + if self.contexts_features_names is not None: + for i, feature in self.contexts_features_names: + for j, feature_by_id in enumerate(self.features_by_ids): + if feature == feature_by_id.name: + contexts_features_map.append((i, feature_by_id.batch)) + + if self.contexts_items_features_names is not None: + for i, feature in self.contexts_items_features_names: + for j, feature_by_id in enumerate(self.features_by_ids): + if feature == feature_by_id.name: + contexts_items_features_map.append((i, feature_by_id.batch)) + + assert len(fixed_items_features_map) + len(contexts_features_map) + len(contexts_items_features_map) == len(self.features_by_ids), "Some features_by_ids were not matched with features_names." + + self.fixed_items_features_map = fixed_items_features_map + self.contexts_features_map = contexts_features_map + self.contexts_items_features_map = contexts_items_features_map def _check_dataset(self): """Verifies that the shapes of the different features are consistent. From 9b533ec07def6460ebed6889d1b12858da1c6369 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Tue, 9 Jan 2024 23:15:52 +0100 Subject: [PATCH 06/22] ENH: Ruff --- choice_learn/data/choice_dataset.py | 36 ++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index cbf57729..7c28d0b5 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -59,7 +59,9 @@ def __init__( if fixed_items_features is not None: if not isinstance(fixed_items_features, tuple): if fixed_items_features_names is not None: - assert len(fixed_items_features) == len(fixed_items_features_names), "Number of features given does not match number of features names given." + assert len(fixed_items_features) == len( + fixed_items_features_names + ), "Number of features given does not match number of features names given." 
self._return_items_features_tuple = False fixed_items_features = (fixed_items_features,) @@ -70,9 +72,7 @@ def __init__( # items_features is already a tuple, names are given, checking consistency if fixed_items_features_names is not None: for f, name in zip(fixed_items_features, fixed_items_features_names): - if ( - len(f) != len(name) - ): + if len(f) != len(name): raise ValueError( "items_features shape and items_features_names shape do not match" ) @@ -86,7 +86,9 @@ def __init__( if not isinstance(contexts_features, tuple): self._return_choices_features_tuple = False if contexts_items_features_names is not None: - assert len(contexts_features) == len(contexts_features_names), "Number of features given does not match number of features names given." + assert len(contexts_features) == len( + contexts_features_names + ), "Number of features given does not match number of features names given." contexts_features_names = (contexts_features_names,) contexts_features = (contexts_features,) @@ -96,9 +98,7 @@ def __init__( self._return_contexts_features_tuple = True if contexts_features_names is not None: for f, name in zip(contexts_features, contexts_features_names): - if ( - len(f) != len(name) - ): + if len(f) != len(name): raise ValueError( "contexts_features shape and contexts_features_names shape do not match" ) @@ -110,7 +110,9 @@ def __init__( if not isinstance(contexts_items_features, tuple) and contexts_items_features is not None: self._return_sessions_items_features_tuple = False if contexts_items_features_names is not None: - assert len(contexts_items_features) == len(contexts_items_features_names), "Number of features given does not match number of features names given for contexts_items." + assert ( + len(contexts_items_features) == len(contexts_items_features_names) + ), "Number of features given does not match number of features names given for contexts_items." contexts_items_features = (contexts_items_features,) contexts_items_features_names = (contexts_items_features_names,) @@ -267,9 +269,15 @@ def _build_features_by_ids(self): if len(self.features_by_ids) == 0: print("No features_by_ids given.") return - - if self.fixed_items_features_names is None and self.contexts_features_names is None and self.contexts_items_features_names is None: - raise ValueError("No features_names given, match with fiven features_by_ids impossible.") + + if ( + self.fixed_items_features_names is None + and self.contexts_features_names is None + and self.contexts_items_features_names is None + ): + raise ValueError( + "No features_names given, match with fiven features_by_ids impossible." + ) fixed_items_features_map = [] contexts_features_map = [] @@ -293,7 +301,9 @@ def _build_features_by_ids(self): if feature == feature_by_id.name: contexts_items_features_map.append((i, feature_by_id.batch)) - assert len(fixed_items_features_map) + len(contexts_features_map) + len(contexts_items_features_map) == len(self.features_by_ids), "Some features_by_ids were not matched with features_names." + assert len(fixed_items_features_map) + len(contexts_features_map) + len( + contexts_items_features_map + ) == len(self.features_by_ids), "Some features_by_ids were not matched with features_names." 
self.fixed_items_features_map = fixed_items_features_map self.contexts_features_map = contexts_features_map From fbe8db15ee9f9753866347d4156d7263467684f4 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Wed, 10 Jan 2024 16:23:24 +0100 Subject: [PATCH 07/22] ADD: get_choice_batch ~ok --- choice_learn/data/choice_dataset.py | 280 ++++++++++++++++------------ 1 file changed, 164 insertions(+), 116 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 7c28d0b5..686e1ddd 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -79,12 +79,14 @@ def __init__( # In this case names are missing, still transform it as a tuple else: fixed_items_features_names = (None,) * len(fixed_items_features) + else: + self._return_items_features_tuple = True # If choices_features is not given as tuple, transform it internally as a tuple # A bit longer because can be None and need to also handle names if contexts_features is not None: if not isinstance(contexts_features, tuple): - self._return_choices_features_tuple = False + self._return_contexts_features_tuple = False if contexts_items_features_names is not None: assert len(contexts_features) == len( contexts_features_names @@ -105,10 +107,12 @@ def __init__( # In this case names are missing, still transform it as a tuple else: - choices_features_names = (None,) * len(contexts_features) + contexts_features_names = (None,) * len(contexts_features) + else: + self._return_contexts_features_tuple = True if not isinstance(contexts_items_features, tuple) and contexts_items_features is not None: - self._return_sessions_items_features_tuple = False + self._return_contexts_items_features_tuple = False if contexts_items_features_names is not None: assert ( len(contexts_items_features) == len(contexts_items_features_names) @@ -124,12 +128,15 @@ def __init__( "contexts_items_features shape and \ contexts_items_features_names shape do not match" ) - self._return_sessions_items_features_tuple = True + self._return_contexts_items_features_tuple = True # In this case names are missing, still transform it as a tuple elif contexts_items_features is not None: - self._return_sessions_items_features_tuple = True + self._return_contexts_items_features_tuple = True contexts_items_features_names = (None,) * len(contexts_items_features) + else: + self._return_contexts_items_features_tuple = True + # --------- [Normalizing features types (DataFrame, List, etc...) 
-> np.ndarray] --------- # # # Part of this code is for handling features given as pandas.DataFrame @@ -172,6 +179,11 @@ def __init__( + (feature.loc[np.sort(feature.index)].to_numpy(),) + contexts_features[i + 1 :] ) + contexts_features_names = ( + contexts_features_names[:i] + + (feature.columns,) + + contexts_features_names[i + 1 :] + ) elif isinstance(feature, list): contexts_features = ( contexts_features[:i] + (np.array(feature),) + contexts_features[i + 1 :] @@ -244,7 +256,7 @@ def __init__( self.fixed_items_features = fixed_items_features self.contexts_features = contexts_features self.contexts_items_features = contexts_items_features - self.context_items_availabilities = contexts_items_availabilities + self.contexts_items_availabilities = contexts_items_availabilities self.choices = choices self.features_by_ids = features_by_ids @@ -258,8 +270,8 @@ def __init__( self.n_choices = len(self.choices) # Different consitency checks to ensure everything is coherent - # self._check_dataset() # Should handle alone if np.arrays are squeezed - # self._return_types = self._check_types() + self._check_dataset() # Should handle alone if np.arrays are squeezed + self._return_types = self._check_types() # self._check_names() # Build .iloc method @@ -284,22 +296,28 @@ def _build_features_by_ids(self): contexts_items_features_map = [] if self.fixed_items_features_names is not None: - for i, feature in self.fixed_items_features_names: - for j, feature_by_id in enumerate(self.features_by_ids): - if feature == feature_by_id.name: - fixed_items_features_map.append((i, feature_by_id.batch)) + for i, feature in enumerate(self.fixed_items_features_names): + if feature is not None: + for j, column_name in enumerate(feature): + for feature_by_id in self.features_by_ids: + if column_name == feature_by_id.name: + fixed_items_features_map.append(((i, j), feature_by_id.batch)) if self.contexts_features_names is not None: - for i, feature in self.contexts_features_names: - for j, feature_by_id in enumerate(self.features_by_ids): - if feature == feature_by_id.name: - contexts_features_map.append((i, feature_by_id.batch)) + for i, feature in enumerate(self.contexts_features_names): + if feature is not None: + for j, column_name in enumerate(feature): + for feature_by_id in self.features_by_ids: + if column_name == feature_by_id.name: + contexts_features_map.append(((i, j), feature_by_id.batch)) if self.contexts_items_features_names is not None: - for i, feature in self.contexts_items_features_names: - for j, feature_by_id in enumerate(self.features_by_ids): - if feature == feature_by_id.name: - contexts_items_features_map.append((i, feature_by_id.batch)) + for i, feature in enumerate(self.contexts_items_features_names): + if feature is not None: + for k, column_name in enumerate(feature): + for feature_by_id in self.features_by_ids: + if column_name == feature_by_id.name: + contexts_items_features_map.append(((i, j), feature_by_id.batch)) assert len(fixed_items_features_map) + len(contexts_features_map) + len( contexts_items_features_map @@ -330,33 +348,33 @@ def _check_num_items_shapes(self): - sessions_items_availabilities Sets the argument base_num_items """ - if self.items_features is not None: - base_num_items = self.items_features[0].shape[0] - elif self.sessions_items_features is not None: - base_num_items = self.sessions_items_features[0].shape[1] - elif self.sessions_items_availabilities is not None: - base_num_items = self.sessions_items_availabilities.shape[1] + if self.fixed_items_features is not None: 
+ base_num_items = self.fixed_items_features[0].shape[0] + elif self.contexts_items_features is not None: + base_num_items = self.contexts_items_features[0].shape[1] + elif self.contexts_items_availabilities is not None: + base_num_items = self.contexts_items_availabilities.shape[1] else: raise ValueError( "No items features, sessions items features or items availabilities are defined" ) self.base_num_items = base_num_items - if self.items_features is not None: - for items_feature in self.items_features: + if self.fixed_items_features is not None: + for items_feature in self.fixed_items_features: if items_feature.shape[0] != base_num_items: raise ValueError(f"shapes are (f{items_feature.shape[0]}, {base_num_items})") - if self.sessions_items_features is not None: - for sessions_items_feature in self.sessions_items_features: + if self.contexts_items_features is not None: + for sessions_items_feature in self.contexts_items_features: if sessions_items_feature.shape[1] != base_num_items: raise ValueError( f"shapes are (f{sessions_items_feature.shape[1]}, {base_num_items})" ) - if self.sessions_items_availabilities is not None: - if self.sessions_items_availabilities.shape[1] != base_num_items: + if self.contexts_items_availabilities is not None: + if self.contexts_items_availabilities.shape[1] != base_num_items: raise ValueError( - f"shapes are (f{self.sessions_items_availabilities.shape[1]}, \ + f"shapes are (f{self.contexts_items_availabilities.shape[1]}, \ {base_num_items})" ) @@ -369,27 +387,27 @@ def _check_num_sessions_shapes(self): - sessions_items_availabilities Sets self.base_num_sessions argument. """ - base_num_sessions = len(self.ragged_choices) + base_num_sessions = len(self.choices) self.base_num_sessions = base_num_sessions - if self.sessions_features is not None: - for sessions_feature in self.sessions_features: + if self.contexts_features is not None: + for sessions_feature in self.contexts_features: if sessions_feature.shape[0] != base_num_sessions: raise ValueError( f"shapes are ({sessions_feature.shape[0]}, {base_num_sessions})" ) - if self.sessions_items_features is not None: - for sessions_items_feature in self.sessions_items_features: + if self.contexts_items_features is not None: + for sessions_items_feature in self.contexts_items_features: if sessions_items_feature.shape[0] != base_num_sessions: raise ValueError( f"shapes are: ({sessions_items_feature.shape[0]}, \ {base_num_sessions})" ) - if self.sessions_items_availabilities is not None: - if self.sessions_items_availabilities.shape[0] != base_num_sessions: + if self.contexts_items_availabilities is not None: + if self.contexts_items_availabilities.shape[0] != base_num_sessions: raise ValueError( - f"shapes are: ({self.sessions_items_availabilities.shape[0]}, \ + f"shapes are: ({self.contexts_items_availabilities.shape[0]}, \ {base_num_sessions})" ) @@ -423,8 +441,8 @@ def _check_types(self): return_types = [] item_types = [] - if self.items_features is not None: - for item_feat in self.items_features: + if self.fixed_items_features is not None: + for item_feat in self.fixed_items_features: if np.issubdtype(item_feat[0].dtype, np.integer): item_types.append(np.int32) else: @@ -432,8 +450,8 @@ def _check_types(self): return_types.append(tuple(item_types)) session_types = [] - if self.sessions_features is not None: - for sessions_feat in self.sessions_features: + if self.contexts_features is not None: + for sessions_feat in self.contexts_features: if np.issubdtype(sessions_feat[0].dtype, np.integer): 
session_types.append(np.int32) else: @@ -441,8 +459,8 @@ def _check_types(self): return_types.append(tuple(session_types)) session_item_types = [] - if self.sessions_items_features is not None: - for session_item_feat in self.sessions_items_features: + if self.contexts_items_features is not None: + for session_item_feat in self.contexts_items_features: if np.issubdtype(session_item_feat[0].dtype, np.integer): session_item_types.append(np.int32) else: @@ -752,102 +770,132 @@ def get_choices_batch(self, choices_indexes, features=None): indexes of the choices (that will be mapped to choice & session indexes) to return """ - if isinstance(choice_index, list): - if self.items_features is None: - items_features = None + if isinstance(choices_indexes, list): + print(choices_indexes) + if self.fixed_items_features is None: + fixed_items_features = None else: - items_features = tuple( - items_feature.astype(self._return_types[0][i]) - for i, items_feature in enumerate(self.items_features) + fixed_items_features = tuple( + items_feature + # .astype(self._return_types[0][i]) + for i, items_feature in enumerate(self.fixed_items_features) ) - # items_features were not given as a tuple, so we return do not return it as a tuple - if not self._return_items_features_tuple: - items_features = items_features[0] - - # Get the session indexes - sessions_indexes = [self.indexes[i] for i in choice_index] - if self.sessions_features is None: - sessions_features = None + if self.contexts_features is None: + contexts_features = None else: - sessions_features = tuple( - np.stack(sessions_feature[sessions_indexes], axis=0).astype( - self._return_types[1][i] - ) - if not isinstance(sessions_feature, Store) - else sessions_feature.iloc[sessions_indexes] - for i, sessions_feature in enumerate(self.sessions_features) + contexts_features = tuple( + contexts_features[choices_indexes] + # .astype(self._return_types[1][i]) + for i, contexts_features in enumerate(self.contexts_features) ) # sessions_features were not given as a tuple, so we return do not return it # as a tuple - if not self._return_sessions_features_tuple: - sessions_features = sessions_features[0] - if self.sessions_items_features is None: - sessions_items_features = None + if self.contexts_items_features is None: + contexts_items_features = None else: - sessions_items_features = tuple( - np.stack(sessions_items_feature[sessions_indexes], axis=0).astype( - self._return_types[2][i] - ) - if not isinstance(sessions_items_feature, Store) - else sessions_items_feature.iloc[sessions_indexes] - for i, sessions_items_feature in enumerate(self.sessions_items_features) + contexts_items_features = tuple( + contexts_items_feature[choices_indexes] + # .astype(self._return_types[2][i]) + for i, contexts_items_feature in enumerate(self.contexts_items_features) ) - # sessions_items_features were not given as a tuple, so we return do not return - # it as a tuple - if not self._return_sessions_items_features_tuple: - sessions_items_features = sessions_items_features[0] - if self.sessions_items_availabilities is None: - sessions_items_availabilities = None + if self.contexts_items_availabilities is None: + contexts_items_availabilities = np.ones((len(choices_indexes), self.base_num_items)) else: - sessions_items_availabilities = self.sessions_items_availabilities[ - sessions_indexes - ].astype(self._return_types[3]) - - choice = self.choices[choice_index].astype(self._return_types[4]) + contexts_items_availabilities = self.contexts_items_availabilities[choices_indexes] + # 
.astype(self._return_types[3])
+
+            choices = self.choices[choices_indexes].astype(self._return_types[4])
+
+            for indexes, func in self.fixed_items_features_map:
+                fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[
+                    fixed_items_features[indexes[0]][:, indexes[1]]
+                ]
+            for indexes, func in self.contexts_features_map:
+                contexts_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[
+                    contexts_features[indexes[0]][:, indexes[1]]
+                ]
+            for indexes, func in self.contexts_items_features_map:
+                contexts_items_features[indexes[0]][:, :, indexes[1] : indexes[1] + 1] = func[
+                    contexts_items_features[indexes[0]][:, :, indexes[1]]
+                ]
+
+            # items_features were not given as a tuple, so we do not return them as a tuple
+            if not self._return_items_features_tuple:
+                fixed_items_features = fixed_items_features[0]
+            if not self._return_contexts_features_tuple:
+                contexts_features = contexts_features[0]
+            # sessions_items_features were not given as a tuple, so we do not return
+            # them as a tuple
+            if not self._return_contexts_items_features_tuple:
+                contexts_items_features = contexts_items_features[0]

             return (
-                items_features,
-                sessions_features,
-                sessions_items_features,
-                sessions_items_availabilities,
-                choice,
+                fixed_items_features,
+                contexts_features,
+                contexts_items_features,
+                contexts_items_availabilities,
+                choices,
             )

-        if isinstance(choice_index, slice):
-            return self.get_choice_batch(list(range(*choice_index.indices(self.choices.shape[0]))))
+        if isinstance(choices_indexes, slice):
+            return self.get_choices_batch(
+                list(range(*choices_indexes.indices(self.choices.shape[0])))
+            )

-        session_index = self.indexes[choice_index]
-        choice = self.choices[choice_index]
+        # Single choice index (int) case
+        choice = self.choices[choices_indexes]

-        if self.items_features is None:
-            items_features = None
+        # fif = self.fif ?
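The list-index branch assembled above slices every context-level array with the same choice indexes and substitutes an all-ones availability matrix when none is stored. A standalone numpy sketch of that slicing behaviour (toy shapes and variable names invented for illustration, not package code):

```python
import numpy as np

n_choices, n_items = 5, 3
contexts_features = (np.arange(n_choices * 2).reshape(n_choices, 2),)   # (n_choices, n_features)
contexts_items_features = (np.ones((n_choices, n_items, 4)),)           # (n_choices, n_items, n_features)
contexts_items_availabilities = None                                    # not provided by the user

choices_indexes = [0, 2, 4]
batch_contexts = tuple(f[choices_indexes] for f in contexts_features)
batch_contexts_items = tuple(f[choices_indexes] for f in contexts_items_features)
# When availabilities are missing, every item is assumed available:
if contexts_items_availabilities is None:
    batch_availabilities = np.ones((len(choices_indexes), n_items))
else:
    batch_availabilities = contexts_items_availabilities[choices_indexes]

print(batch_contexts[0].shape, batch_contexts_items[0].shape, batch_availabilities.shape)
# (3, 2) (3, 3, 4) (3, 3)
```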
+ if self.fixed_items_features is None: + fixed_items_features = None else: - items_features = tuple(items_feature for items_feature in self.items_features) + fixed_items_features = tuple( + items_feature for items_feature in self.fixed_items_features + ) - if self.sessions_features is None: - sessions_features = None + if self.contexts_features is None: + contexts_features = None else: - sessions_features = tuple( - sessions_feature[session_index] for sessions_feature in self.sessions_features + contexts_features = tuple( + contexts_feature[choices_indexes] for contexts_feature in self.contexts_features ) - if self.sessions_items_features is None: - sessions_items_features = None + if self.contexts_items_features is None: + contexts_items_features = None else: - sessions_items_features = tuple( - sessions_items_feature[session_index] - for sessions_items_feature in self.sessions_items_features + contexts_items_features = tuple( + contexts_items_feature[choices_indexes] + for contexts_items_feature in self.contexts_items_features ) - if self.sessions_items_availabilities is None: - sessions_items_availabilities = None + if self.contexts_items_availabilities is None: + contexts_items_availabilities = np.ones((self.base_num_items)) else: - sessions_items_availabilities = self.sessions_items_availabilities[session_index] - - return df_chosen_item, dfs_available_items + contexts_items_availabilities = self.contexts_items_availabilities[choices_indexes] + + for indexes, func in self.fixed_items_features_map: + fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ + fixed_items_features[indexes[0]][:, indexes[1]] + ] + for indexes, func in self.contexts_features_map: + contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = func[ + contexts_features[indexes[0]][indexes[1]] + ] + for indexes, func in self.contexts_items_features_map: + contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ + contexts_items_features[indexes[0]][:, indexes[1]] + ] + + return ( + fixed_items_features, + contexts_features, + contexts_items_features, + contexts_items_availabilities, + choice, + ) def __getitem__(self, choices_indexes): """Method to create a sub-ChoiceDataset with only a subset of choices, from their indexes. From 101e402c4b2637bd01b72b617e28d1ee2e6e9c9d Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Thu, 11 Jan 2024 12:19:40 +0100 Subject: [PATCH 08/22] enhancing readme --- README.md | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 556f8eb4..4ab1d6e2 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ -Choice-Learn is a Python package designed to help you build with ease discrete choice models. +Choice-Learn is a Python package designed to help you build discrete choice models. The package provides ready to use datasets and different models from the litterature. It also provides a lower level use if you want to customize any model or create your own from scratch. In particular you will find smart datasets handling to limit RAM usage and different structure commons to any choice model. Choice-Learn uses NumPy and pandas as data backend engines and TensorFlow for models. @@ -32,11 +32,34 @@ This repository contains a private version of the package. ## What's in there ? +### Data +- Generic dataset handling with the ChoiceDataset class +- Ready-To-Use datasets: + - SwissMetro from Bierlaire et al. (2001) + - ModeCanada from Koppelman et al. 
(1993)
+
+### Models
+- Ready-to-use models:
+  - Conditional MultiNomialLogit, Train, K.; McFadden, D.; Ben-Akiva, M. (1987)
+  - RUMnet, Aouad A.; Désir A. (2022)
+- Models to be implemented:
+  - Nested MultiNomialLogit
+  - MultiNomialLogit with latent variables (MixedLogit)
+  - TasteNet
+  - SHOPPER
+  - Others ...
+- Custom modelling is made easy by subclassing the ChoiceModel class
+
+### Different tools (to come)
+- Standardization of evaluation protocols
+- Assortment optimization from a fitted model
+- Interfaces
+
 ## Getting Started - Fast Track

 You can find the following notebooks to help you get started with the package:
 - [Introduction to data management](notebooks/choice_learn_introduction_data.ipynb)
-- [Introduction to modelling with the conditional logit model on ModeCanada dataaset](notebooks/choice_learn_introduction_clogit.ipynb)
+- [Introduction to modelling with the conditional logit model on ModeCanada dataset](notebooks/choice_learn_introduction_clogit.ipynb)
 - [Introduction to custom modelling with the ModeCanada dataset](notebooks/custom_model.ipynb)

 ## Installation
@@ -70,22 +93,8 @@ from choice_learn.models import ConditionalMNL

 ## Documentation

-TODO: Github pages is not enabled by default, you need to enable it in the repository settings: Settings > Pages > Source: "Deploy from a branch" / Branch: "gh-pages" / Folder: "/(root)"
-
 A detailed documentation of this project is available [here](https://artefactory.github.io/choice-learn-private/)

-To serve the documentation locally, run the following command:
-
-```bash
-mkdocs serve
-```
-
-To build it and deploy it to GitHub pages, run the following command:
-
-```bash
-make deploy_docs
-```
-
 ## Citation

 ### Contributors

From 29630ab1c5b37866c659c4dd3a259aa8d8863858 Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Fri, 12 Jan 2024 14:27:19 +0100
Subject: [PATCH 09/22] ENH some doc

---
 README.md                  | 2 +-
 mkdocs.yaml                | 3 +++
 requirements-developer.txt | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4ab1d6e2..8a614595 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ Choice-Learn requires the following:
 ## Usage
 ```python
 from choice_learn.data import ChoiceDataset
-from choice_learn.models import ConditionalMNL
+from choice_learn.models import ConditionalMNL, RUMnet
 ```

 ## Documentation
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 8482bf83..305ba7ae 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -32,10 +32,13 @@ markdown_extensions:
   - pymdownx.inlinehilite
   - pymdownx.snippets
   - pymdownx.superfences
+  - mdx_math

 plugins:
   - mkdocstrings
   - search
+extra_javascript:
+  - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.4/MathJax.js?config=TeX-AMS-MML_HTMLorMML

 nav:
   - Home: index.md
diff --git a/requirements-developer.txt b/requirements-developer.txt
index 20cf8a44..1937e301 100644
--- a/requirements-developer.txt
+++ b/requirements-developer.txt
@@ -5,6 +5,7 @@ pytest==7.3.2
 mkdocs==1.4.3
 mkdocs-material==9.1.15
 mkdocstrings-python==1.1.2
+python-markdown-math
 bandit==1.7.5
 nbstripout==0.6.1
 ipykernel==6.24.0

From 238773956ff6c556c87b8df7c69f541a189f4eb6 Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Fri, 12 Jan 2024 18:33:21 +0100
Subject: [PATCH 10/22] FIX: detailed features with pd.DataFrame

---
 choice_learn/data/choice_dataset.py | 31 ++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
index 686e1ddd..50a492f9 100644
--- 
a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -88,7 +88,7 @@ def __init__( if not isinstance(contexts_features, tuple): self._return_contexts_features_tuple = False if contexts_items_features_names is not None: - assert len(contexts_features) == len( + assert len(contexts_features[0]) == len( contexts_features_names ), "Number of features given does not match number of features names given." @@ -115,7 +115,7 @@ def __init__( self._return_contexts_items_features_tuple = False if contexts_items_features_names is not None: assert ( - len(contexts_items_features) == len(contexts_items_features_names) + len(contexts_items_features[0][0]) == len(contexts_items_features_names) ), "Number of features given does not match number of features names given for contexts_items." contexts_items_features = (contexts_items_features,) contexts_items_features_names = (contexts_items_features_names,) @@ -196,9 +196,11 @@ def __init__( if "session_id" in feature.columns: if "item_id" in feature.columns: feature_array = [] - for sess in np.sort(feature.session_id): + for sess in np.sort(feature.session_id.unique()): sess_df = feature.loc[feature.session_id == sess] - sess_df = sess_df.set_index("item_id") + sess_df = sess_df[ + sess_df.columns.difference(["sess_id"]) + ].set_index("item_id") feature_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy()) contexts_items_features = ( contexts_items_features[:i] @@ -212,12 +214,16 @@ def __init__( + (feature.loc[np.sort(feature.index)].to_numpy(),) + contexts_items_features[i + 1 :] ) + else: + raise ValueError("session_id column not found in contexts_items_features") elif isinstance(feature, list): contexts_items_features = ( contexts_items_features[:i] + (np.array(feature),) + contexts_items_features[i + 1 :] ) + print(contexts_items_features) + print(contexts_items_features[0].shape) if contexts_items_availabilities is not None: if isinstance(contexts_items_availabilities, list): contexts_items_availabilities = np.array( @@ -265,6 +271,7 @@ def __init__( self.contexts_features_names = contexts_features_names self.contexts_items_features_names = contexts_items_features_names + # What about typing ? should builf after check to change it ? 
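The DataFrame branch patched above pivots a long-format frame (one row per session/item pair) into the dense (n_sessions, n_items, n_features) array the dataset works with. A rough standalone equivalent of that loop, assuming a toy frame with `session_id`, `item_id` and a single `price` column (names chosen for illustration only):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "session_id": [0, 0, 1, 1],
    "item_id":    ["a", "b", "a", "b"],
    "price":      [1.0, 2.0, 1.5, 2.5],
})

feature_array = []
for sess in np.sort(df.session_id.unique()):
    sess_df = df.loc[df.session_id == sess]
    # Drop the session column and index rows by item so items are ordered consistently
    sess_df = sess_df[sess_df.columns.difference(["session_id"])].set_index("item_id")
    feature_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy())

contexts_items_features = np.stack(feature_array, axis=0)
print(contexts_items_features.shape)  # (2, 2, 1): (n_sessions, n_items, n_features)
```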
self._build_features_by_ids() self.n_choices = len(self.choices) @@ -272,7 +279,7 @@ def __init__( # Different consitency checks to ensure everything is coherent self._check_dataset() # Should handle alone if np.arrays are squeezed self._return_types = self._check_types() - # self._check_names() + self._check_names() # Build .iloc method self.indexer = ChoiceDatasetIndexer(self) @@ -473,8 +480,8 @@ def _check_types(self): def _check_names(self): """Verifies that the names given to features are consistent with the features themselves.""" - if self.items_features_names is not None: - for name, features in zip(self.items_features_names, self.items_features): + if self.fixed_items_features_names is not None: + for name, features in zip(self.fixed_items_features_names, self.fixed_items_features): if name is not None: if len(name) != features.shape[1]: raise ValueError( @@ -482,8 +489,8 @@ def _check_names(self): length {len(name)} while items_features has {features.shape[1]} elements" ) - if self.sessions_features_names is not None: - for name, features in zip(self.sessions_features_names, self.sessions_features): + if self.contexts_features_names is not None: + for name, features in zip(self.contexts_features_names, self.contexts_features): if name is not None: if len(name) != features.shape[1]: raise ValueError( @@ -491,17 +498,17 @@ def _check_names(self): length {len(name)} while sessions_features has {features.shape[1]} elements" ) - if self.sessions_items_features_names is not None: + if self.contexts_items_features_names is not None: for ( name, features, - ) in zip(self.sessions_items_features_names, self.sessions_items_features): + ) in zip(self.contexts_items_features_names, self.contexts_items_features): if name is not None: if len(name) != features.shape[2]: raise ValueError( f"Specified \ sessions_items_features_names has length {len(name)} while \ - sessions_items_features has {features.shape[1]} elements" + sessions_items_features has {features.shape[2]} elements" ) def __len__(self): From 290de081ecbdd07720b6bf8c8c0c7e5b4cae027c Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Fri, 12 Jan 2024 18:49:23 +0100 Subject: [PATCH 11/22] FIX: summary df --- choice_learn/data/choice_dataset.py | 54 ++++++++++++++++++----------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 50a492f9..cd5bb341 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -199,7 +199,7 @@ def __init__( for sess in np.sort(feature.session_id.unique()): sess_df = feature.loc[feature.session_id == sess] sess_df = sess_df[ - sess_df.columns.difference(["sess_id"]) + sess_df.columns.difference(["session_id"]) ].set_index("item_id") feature_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy()) contexts_items_features = ( @@ -207,6 +207,11 @@ def __init__( + (np.stack(feature_array, axis=0),) + contexts_items_features[i + 1 :] ) + contexts_items_features_names = ( + contexts_items_features_names[:i] + + (sess_df.columns,) + + contexts_items_features_names[i + 1 :] + ) else: feature = feature.set_index("session_id") contexts_items_features = ( @@ -214,6 +219,11 @@ def __init__( + (feature.loc[np.sort(feature.index)].to_numpy(),) + contexts_items_features[i + 1 :] ) + contexts_items_features_names = ( + contexts_items_features_names[:i] + + (feature.columns,) + + contexts_items_features_names[i + 1 :] + ) else: raise ValueError("session_id column not found in 
contexts_items_features") elif isinstance(feature, list): @@ -222,8 +232,6 @@ def __init__( + (np.array(feature),) + contexts_items_features[i + 1 :] ) - print(contexts_items_features) - print(contexts_items_features[0].shape) if contexts_items_availabilities is not None: if isinstance(contexts_items_availabilities, list): contexts_items_availabilities = np.array( @@ -728,39 +736,43 @@ def save(self): def summary(self): """Method to display a summary of the dataset.""" - print("Summary of the dataset:") + print("%=====================================================================%") + print("%%% Summary of the dataset:") + print("%=====================================================================%") print("Number of items:", self.get_num_items()) - print("Number of sessions:", self.get_num_sessions()) print( "Number of choices:", - self.get_num_choices(), - "Averaging", - self.get_num_choices() / self.get_num_sessions(), - "choices per session", + len(self), ) - if self.items_features is not None: - print(f"Items features: {self.items_features_names}") - if self.items_features is not None: - print(f"{sum([f.shape[1] for f in self.items_features])} items features") + if self.fixed_items_features is not None: + print(f"Fixed Items Features:") + print(f"{sum([f.shape[1] for f in self.fixed_items_features])} items features") + if self.fixed_items_features_names is not None: + print(f"with names: {self.fixed_items_features_names}") else: print("No items features registered") + print("\n") - if self.sessions_features is not None: - print(f"Sessions features: {self.sessions_features_names}") - if self.sessions_features is not None: - print(f"{sum([f.shape[1] for f in self.sessions_features])} session features") + if self.contexts_features is not None: + print(f"Sessions features:") + print(f"{sum([f.shape[1] for f in self.contexts_features])} session features") + if self.contexts_features_names is not None: + print(f"with names: {self.contexts_features_names}") else: print("No sessions features registered") + print("\n") - if self.sessions_items_features is not None: - print(f"Session Items features: {self.sessions_items_features_names}") - if self.sessions_items_features is not None: + if self.contexts_items_features is not None: + print(f"Session Items features:") print( - f"{sum([f.shape[2] for f in self.sessions_items_features])} sessions \ + f"{sum([f.shape[2] for f in self.contexts_items_features])} sessions \ items features" ) + if self.contexts_items_features_names is not None: + print(f"with names: {self.contexts_items_features_names}") else: print("No sessions items features registered") + print("%=====================================================================%") def get_choices_batch(self, choices_indexes, features=None): """Method to access data within the ListChoiceDataset from its index. 
From 7c2a0679528062de01c44b5f7e0aee9ce68b9654 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Sat, 13 Jan 2024 00:20:33 +0100 Subject: [PATCH 12/22] ADD: working indexer --- choice_learn/data/choice_dataset.py | 257 +++++++++++++--------------- choice_learn/data/indexer.py | 192 +++++++++++++-------- 2 files changed, 236 insertions(+), 213 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index cd5bb341..8e2660e2 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -100,7 +100,7 @@ def __init__( self._return_contexts_features_tuple = True if contexts_features_names is not None: for f, name in zip(contexts_features, contexts_features_names): - if len(f) != len(name): + if len(f[0]) != len(name): raise ValueError( "contexts_features shape and contexts_features_names shape do not match" ) @@ -123,10 +123,9 @@ def __init__( # sessions_items_features is already a tuple, names are given, checking consistency elif contexts_items_features is not None and contexts_items_features_names is not None: for f, name in zip(contexts_items_features, contexts_items_features_names): - if len(f) != len(name): + if len(f[0][0]) != len(name): raise ValueError( - "contexts_items_features shape and \ - contexts_items_features_names shape do not match" + "contexts_items_features shape and contexts_items_features_names shape do not match" ) self._return_contexts_items_features_tuple = True # In this case names are missing, still transform it as a tuple @@ -280,7 +279,11 @@ def __init__( self.contexts_items_features_names = contexts_items_features_names # What about typing ? should builf after check to change it ? - self._build_features_by_ids() + ( + self.fixed_items_features_map, + self.contexts_features_map, + self.contexts_items_features_map, + ) = self._build_features_by_ids() self.n_choices = len(self.choices) @@ -295,7 +298,7 @@ def __init__( def _build_features_by_ids(self): if len(self.features_by_ids) == 0: print("No features_by_ids given.") - return + return [], [], [] if ( self.fixed_items_features_names is None @@ -338,9 +341,7 @@ def _build_features_by_ids(self): contexts_items_features_map ) == len(self.features_by_ids), "Some features_by_ids were not matched with features_names." - self.fixed_items_features_map = fixed_items_features_map - self.contexts_features_map = contexts_features_map - self.contexts_items_features_map = contexts_items_features_map + return fixed_items_features_map, contexts_features_map, contexts_items_features_map def _check_dataset(self): """Verifies that the shapes of the different features are consistent. @@ -552,14 +553,14 @@ def get_num_choices(self): return len(self) @classmethod - def _sessions_items_features_df_to_np( + def _contexts_items_features_df_to_np( cls, df, items_index, - sessions_index, + contexts_index, features, items_id_column="item_id", - sessions_id_column="session_id", + contexts_id_column="session_id", ): """Builds sessions_items_features and sessions_items_availabilities from dataframe. 
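Before the hunks that follow, it helps to see what this helper is contracted to produce: a dense features tensor plus a 0/1 availability matrix, where items a context does not offer are zero-padded and marked unavailable. A compact standalone sketch of that bookkeeping (toy data and names, not the package's internals):

```python
import numpy as np

items_index = ["a", "b", "c"]
# context 0 offers every item, context 1 only offers "b"
rows = {0: {"a": [1.0], "b": [2.0], "c": [3.0]}, 1: {"b": [2.5]}}

features, availabilities = [], []
for sess, offered in rows.items():
    feats, av = [], []
    for item in items_index:
        if item in offered:
            feats.append(offered[item])   # the item's features in this context
            av.append(1.0)
        else:
            feats.append([0.0])           # zero-pad the missing item
            av.append(0.0)
    features.append(feats)
    availabilities.append(av)

print(np.array(features).shape)     # (2, 3, 1)
print(np.array(availabilities))     # [[1. 1. 1.] [0. 1. 0.]]
```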
@@ -590,17 +591,17 @@ def _sessions_items_features_df_to_np( except ValueError: pass - sessions_items_features = [] - sessions_items_availabilities = [] - for sess in sessions_index: - sess_df = df.loc[df[sessions_id_column] == sess] + contexts_items_features = [] + contexts_items_availabilities = [] + for sess in contexts_index: + sess_df = df.loc[df[contexts_id_column] == sess] if len(sess_df) == len(items_index): sess_df = sess_df.T sess_df.columns = sess_df.loc[items_id_column] if features is not None: - sessions_items_features.append(sess_df[items_index].loc[features].T.values) - sessions_items_availabilities.append(np.ones(len(items_index))) + contexts_items_features.append(sess_df[items_index].loc[features].T.values) + contexts_items_availabilities.append(np.ones(len(items_index))) else: sess_feats = [] sess_av = [] @@ -614,24 +615,24 @@ def _sessions_items_features_df_to_np( if features is not None: sess_feats.append(np.zeros(len(features))) sess_av.append(0) - sessions_items_features.append(sess_feats) - sessions_items_availabilities.append(sess_av) + contexts_items_features.append(sess_feats) + contexts_items_availabilities.append(sess_av) if features is not None: - sessions_items_features = (np.array(sessions_items_features),) + sessions_items_features = (np.array(contexts_items_features),) else: sessions_items_features = None - return sessions_items_features, np.array(sessions_items_availabilities) + return sessions_items_features, np.array(contexts_items_availabilities) @classmethod def from_single_df( cls, df, - items_features_columns, - sessions_features_columns, - sessions_items_features_columns, + fixed_items_features_columns, + contexts_features_columns, + contexts_items_features_columns, items_id_column="item_id", - sessions_id_column="session_id", + contexts_id_column="context_id", choices_column="choice", choice_mode="items_name", ): @@ -661,55 +662,55 @@ def from_single_df( """ # Ordering items and sessions by id items = np.sort(df[items_id_column].unique()) - sessions = np.sort(df[sessions_id_column].unique()) + sessions = np.sort(df[contexts_id_column].unique()) - if items_features_columns is not None: - items_features = df[items_features_columns + [items_id_column]].drop_duplicates() + if fixed_items_features_columns is not None: + items_features = df[fixed_items_features_columns + [items_id_column]].drop_duplicates() items_features = items_features.set_index(items_id_column) items_features = (items_features.loc[items].to_numpy(),) - items_features_columns = (items_features_columns,) + items_features_columns = (fixed_items_features_columns,) else: items_features = None - if sessions_features_columns is not None: - sessions_features = df[ - sessions_features_columns + [sessions_id_column] + if contexts_features_columns is not None: + contexts_features = df[ + contexts_features_columns + [contexts_id_column] ].drop_duplicates() - sessions_features = sessions_features.set_index(sessions_id_column) - sessions_features = (sessions_features.loc[sessions].to_numpy(),) + contexts_features = contexts_features.set_index(contexts_id_column) + contexts_features = (contexts_features.loc[sessions].to_numpy(),) - sessions_features_columns = (sessions_features_columns,) + contexts_features_columns = (contexts_features_columns,) else: - sessions_features = None + contexts_features = None ( - sessions_items_features, - sessions_items_availabilities, - ) = cls._sessions_items_features_df_to_np( + contexts_items_features, + contexts_items_availabilities, + ) = 
cls._contexts_items_features_df_to_np( df, items_index=items, - sessions_index=sessions, - features=sessions_items_features_columns, + contexts_index=sessions, + features=contexts_items_features_columns, items_id_column=items_id_column, - sessions_id_column=sessions_id_column, + contexts_id_column=contexts_id_column, ) - sessions_items_features_columns = ( - (sessions_items_features_columns,) - if sessions_items_features_columns is not None + contexts_items_features_columns = ( + (contexts_items_features_columns,) + if contexts_items_features_columns is not None else None ) if choice_mode == "item_id": - choices = df[[choices_column, sessions_id_column]].drop_duplicates(sessions_id_column) - choices = choices.set_index(sessions_id_column) + choices = df[[choices_column, contexts_id_column]].drop_duplicates(contexts_id_column) + choices = choices.set_index(contexts_id_column) choices = choices.loc[sessions].to_numpy() # items is the value (str) of the item choices = [np.where(items == c)[0] for c in choices] elif choice_mode == "one_zero": - choices = df[[items_id_column, choices_column, sessions_id_column]] + choices = df[[items_id_column, choices_column, contexts_id_column]] choices = choices.loc[choices[choices_column] == 1] - choices = choices = choices.set_index(sessions_id_column) + choices = choices = choices.set_index(contexts_id_column) choices = ( choices.loc[sessions][items_id_column] .map({k: v for v, k in enumerate(items)}) @@ -720,14 +721,14 @@ def from_single_df( f"choice_mode {choice_mode} not recognized. Must be in ['item_id', 'one_zero']" ) return ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, - items_features_names=items_features_columns, - sessions_features_names=sessions_features_columns, - sessions_items_features_names=sessions_items_features_columns, + fixed_items_features_names=items_features_columns, + contexts_features_names=contexts_features_columns, + contexts_items_features_names=contexts_items_features_columns, ) def save(self): @@ -745,7 +746,7 @@ def summary(self): len(self), ) if self.fixed_items_features is not None: - print(f"Fixed Items Features:") + print("Fixed Items Features:") print(f"{sum([f.shape[1] for f in self.fixed_items_features])} items features") if self.fixed_items_features_names is not None: print(f"with names: {self.fixed_items_features_names}") @@ -754,7 +755,7 @@ def summary(self): print("\n") if self.contexts_features is not None: - print(f"Sessions features:") + print("Sessions features:") print(f"{sum([f.shape[1] for f in self.contexts_features])} session features") if self.contexts_features_names is not None: print(f"with names: {self.contexts_features_names}") @@ -763,7 +764,7 @@ def summary(self): print("\n") if self.contexts_items_features is not None: - print(f"Session Items features:") + print("Session Items features:") print( f"{sum([f.shape[2] for f in self.contexts_items_features])} sessions \ items features" @@ -790,11 +791,10 @@ def get_choices_batch(self, choices_indexes, features=None): """ if isinstance(choices_indexes, list): - print(choices_indexes) if self.fixed_items_features is None: fixed_items_features = None else: - fixed_items_features = tuple( + 
fixed_items_features = list( items_feature # .astype(self._return_types[0][i]) for i, items_feature in enumerate(self.fixed_items_features) @@ -803,7 +803,7 @@ def get_choices_batch(self, choices_indexes, features=None): if self.contexts_features is None: contexts_features = None else: - contexts_features = tuple( + contexts_features = list( contexts_features[choices_indexes] # .astype(self._return_types[1][i]) for i, contexts_features in enumerate(self.contexts_features) @@ -814,7 +814,7 @@ def get_choices_batch(self, choices_indexes, features=None): if self.contexts_items_features is None: contexts_items_features = None else: - contexts_items_features = tuple( + contexts_items_features = list( contexts_items_feature[choices_indexes] # .astype(self._return_types[2][i]) for i, contexts_items_feature in enumerate(self.contexts_items_features) @@ -829,26 +829,53 @@ def get_choices_batch(self, choices_indexes, features=None): choices = self.choices[choices_indexes].astype(self._return_types[4]) for indexes, func in self.fixed_items_features_map: - fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ - fixed_items_features[indexes[0]][:, indexes[1]] - ] + fixed_items_features[indexes[0]] = np.concatenate( + [ + fixed_items_features[indexes[0]][:, : indexes[1]], + func[fixed_items_features[indexes[0]][:, indexes[1]]], + fixed_items_features[indexes[0]][:, indexes[1] + 1 :], + ], + axis=1, + ) for indexes, func in self.contexts_features_map: - contexts_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ - contexts_features[indexes[0]][:, indexes[1]] - ] + contexts_features[indexes[0]] = np.concatenate( + [ + contexts_features[indexes[0]][:, : indexes[1]], + func[contexts_features[indexes[0]][:, indexes[1]]], + contexts_features[indexes[0]][:, indexes[1] + 1 :], + ], + axis=1, + ) for indexes, func in self.contexts_items_features_map: contexts_items_features[indexes[0]][:, :, indexes[1] : indexes[1] + 1] = func[ contexts_items_features[indexes[0]][:, :, indexes[1]] ] + contexts_items_features[indexes[0]] = np.concatenate( + [ + contexts_items_features[indexes[0]][:, :, : indexes[1]], + func[contexts_items_features[indexes[0]][:, :, indexes[1]]], + contexts_items_features[indexes[0]][:, :, indexes[1] + 1 :], + ], + axis=2, + ) # items_features were not given as a tuple, so we return do not return it as a tuple - if not self._return_items_features_tuple: + if self._return_items_features_tuple and self.fixed_items_features is not None: + fixed_items_features = tuple(fixed_items_features) + elif self.fixed_items_features is not None: fixed_items_features = fixed_items_features[0] - if not self._return_contexts_features_tuple: + if self._return_contexts_features_tuple and self.contexts_features is not None: + contexts_features = tuple(contexts_features) + elif self.contexts_features is not None: contexts_features = contexts_features[0] # sessions_items_features were not given as a tuple, so we return do not return # it as a tuple - if not self._return_contexts_items_features_tuple: + if ( + self._return_contexts_items_features_tuple + and self.contexts_items_features is not None + ): + contexts_items_features = tuple(contexts_items_features) + elif self.contexts_items_features is not None: contexts_items_features = contexts_items_features[0] return ( @@ -929,73 +956,29 @@ def __getitem__(self, choices_indexes): ChoiceDataset ChoiceDataset with only the sessions indexed by indexes """ - if isinstance(session_indexes, int): - session_indexes = [session_indexes] - elif 
isinstance(session_indexes, slice): - return self.__getitem__(list(range(*session_indexes.indices(len(self.ragged_choices))))) + if isinstance(choices_indexes, int): + choices_indexes = [choices_indexes] + elif isinstance(choices_indexes, slice): + return self.__getitem__(list(range(*choices_indexes.indices(len(self.choices))))) return ChoiceDataset( - items_features=self.items_features, - sessions_features=tuple( - self.sessions_features[i][session_indexes] - for i in range(len(self.sessions_features)) + fixed_items_features=self.fixed_items_features, + contexts_features=tuple( + self.contexts_features[i][choices_indexes] + for i in range(len(self.contexts_features)) ), - sessions_items_features=tuple( - self.sessions_items_features[i][session_indexes] - for i in range(len(self.sessions_items_features)) + contexts_items_features=tuple( + self.contexts_items_features[i][choices_indexes] + for i in range(len(self.contexts_items_features)) ), - sessions_items_availabilities=self.sessions_items_availabilities[session_indexes], - choices=[self.ragged_choices[i] for i in session_indexes], - items_features_names=self.items_features_names, - sessions_features_names=self.sessions_features_names, - sessions_items_features_names=self.sessions_items_features_names, + contexts_items_availabilities=self.contexts_items_availabilities[choices_indexes], + choices=[self.choices[i] for i in choices_indexes], + fixed_items_features_names=self.fixed_items_features_names, + contexts_features_names=self.contexts_features_names, + contexts_items_features_names=self.contexts_items_features_names, + features_by_ids=self.features_by_ids, ) - def old_batch(self, batch_size, shuffle=None, sample_weight=None): - """Iterates over dataset return batches of length batch_size. - - Parameters - ---------- - batch_size : int - batch size to set - shuffle: bool - Whether or not to shuffle the dataset - sample_weight : Iterable - list of weights to be returned with the right indexing during the shuffling - """ - - if shuffle is None: - shuffle = self.shuffle - if batch_size == -1: - batch_size = self.get_num_choices() - - # Get indexes for each choice - num_choices = self.get_num_choices() - indexes = np.arange(num_choices) - # Shuffle indexes - if shuffle and not batch_size == -1: - indexes = np.random.permutation(indexes) - - yielded_size = 0 - while yielded_size < num_choices: - # Return sample_weight if not None, for index matching - if sample_weight is not None: - yield ( - self.get_choice_batch( - indexes[yielded_size : yielded_size + batch_size].tolist() - ), - sample_weight[indexes[yielded_size : yielded_size + batch_size].tolist()], - ) - else: - yield self.get_choice_batch( - indexes[yielded_size : yielded_size + batch_size].tolist() - ) - yielded_size += batch_size - - # Special exit strategy for batch_size = -1 - if batch_size == -1: - yielded_size += 2 * num_choices - @property def batch(self): """Indexer.""" @@ -1015,14 +998,12 @@ def iter_batch(self, batch_size, shuffle=None, sample_weight=None): sample_weight : Iterable list of weights to be returned with the right indexing during the shuffling """ - if shuffle is None: shuffle = self.shuffle if batch_size == -1: - batch_size = self.get_num_choices() - + batch_size = len(self) # Get indexes for each choice - num_choices = self.get_num_choices() + num_choices = len(self) indexes = np.arange(num_choices) # Shuffle indexes if shuffle and not batch_size == -1: diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py index 2c8568b1..723ec7d0 100644 
--- a/choice_learn/data/indexer.py +++ b/choice_learn/data/indexer.py @@ -154,7 +154,7 @@ def __init__(self, choice_dataset): """ self.choice_dataset = choice_dataset - def _get_items_features(self): + def _get_fixed_items_features(self): """Method to access items features of the ChoiceDataset. Returns: @@ -162,20 +162,20 @@ def _get_items_features(self): tuple of np.ndarray or np.ndarray items_features of the ChoiceDataset """ - if self.choice_dataset.items_features is None: + if self.choice_dataset.fixed_items_features is None: items_features = None else: items_features = tuple( items_feature.astype(self.choice_dataset._return_types[0][i]) - for i, items_feature in enumerate(self.choice_dataset.items_features) + for i, items_feature in enumerate(self.choice_dataset.fixed_items_features) ) # items_features were not given as a tuple, so we return do not return it as a tuple - if not self.choice_dataset._return_items_features_tuple: - items_features = items_features[0] + # if not self.choice_dataset._return_items_features_tuple: + # items_features = items_features[0] return items_features - def _get_sessions_features(self, sessions_indexes): + def _get_contexts_features(self, contexts_indexes): """Method to access sessions features of the ChoiceDataset. Parameters @@ -188,31 +188,31 @@ def _get_sessions_features(self, sessions_indexes): tuple of np.ndarray or np.ndarray items_features of the ChoiceDataset """ - if self.choice_dataset.sessions_features is None: - sessions_features = None + if self.choice_dataset.contexts_features is None: + contexts_features = None else: - sessions_features = [] - for i, sessions_feature in enumerate(self.choice_dataset.sessions_features): - if hasattr(sessions_feature, "iloc"): - sessions_features.append( - sessions_feature.iloc[sessions_indexes].astype( + contexts_features = [] + for i, contexts_feature in enumerate(self.choice_dataset.contexts_features): + if hasattr(contexts_feature, "batch"): + contexts_features.append( + contexts_feature.batch[contexts_indexes].astype( self.choice_dataset._return_types[1][i] ) ) else: - sessions_features.append( - np.stack(sessions_feature[sessions_indexes], axis=0).astype( + contexts_features.append( + np.stack(contexts_feature[contexts_indexes], axis=0).astype( self.choice_dataset._return_types[1][i] ) ) # sessions_features were not given as a tuple, so we return do not return it as a tuple - if not self.choice_dataset._return_sessions_features_tuple: - sessions_features = sessions_features[0] - else: - sessions_features = tuple(sessions_features) - return sessions_features + # if not self.choice_dataset._return_contexts_features_tuple: + # contexts_features = contexts_feature[0] + # else: + # contexts_features = tuple(contexts_features) + return contexts_features - def _get_sessions_items_features(self, sessions_indexes): + def _get_contexts_items_features(self, contexts_indexes): """Method to access sessions items features of the ChoiceDataset. 
Parameters @@ -225,28 +225,28 @@ def _get_sessions_items_features(self, sessions_indexes): tuple of np.ndarray or np.ndarray items_features of the ChoiceDataset """ - if self.choice_dataset.sessions_items_features is None: + if self.choice_dataset.contexts_items_features is None: return None - sessions_items_features = [] - for i, sessions_items_feature in enumerate(self.choice_dataset.sessions_items_features): - if hasattr(sessions_items_feature, "iloc"): - sessions_items_features.append( - sessions_items_feature.iloc[sessions_indexes].astype(self._return_types[2][i]) + contexts_items_features = [] + for i, contexts_items_feature in enumerate(self.choice_dataset.contexts_items_features): + if hasattr(contexts_items_feature, "iloc"): + contexts_items_features.append( + contexts_items_feature.iloc[contexts_indexes].astype(self._return_types[2][i]) ) else: - sessions_items_features.append( - np.stack(sessions_items_feature[sessions_indexes], axis=0).astype( + contexts_items_features.append( + np.stack(contexts_items_feature[contexts_indexes], axis=0).astype( self.choice_dataset._return_types[2][i] ) ) # sessions_items_features were not given as a tuple, thus we do not return it as a tuple - if self.choice_dataset._return_sessions_items_features_tuple: - sessions_items_features = tuple(sessions_items_features) - else: - sessions_items_features = sessions_items_features[0] - return sessions_items_features + # if self.choice_dataset._return_contexts_items_features_tuple: + # contexts_items_features = tuple(contexts_items_features) + # else: + # contexts_items_features = contexts_items_features[0] + return contexts_items_features - def __getitem__(self, choice_index): + def __getitem__(self, choices_indexes): """Method to access data within the ChoiceDataset from its index. One index corresponds to a choice within a session. 
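The `hasattr(..., "batch")` test introduced above is what lets plain numpy arrays and id-backed storages share one code path: anything exposing a `.batch` indexer is asked for its own rows. A minimal sketch of an object satisfying that protocol (the `TinyStorage` class is invented for illustration; it is not a package class):

```python
import numpy as np

class TinyStorage:
    """Minimal stand-in for a feature storage exposing the `.batch` protocol."""

    def __init__(self, values):
        self._values = np.asarray(values)

    @property
    def batch(self):
        # Returning the array itself makes `storage.batch[indexes]` work.
        return self._values

features = [np.arange(8).reshape(4, 2), TinyStorage([[10.0], [11.0], [12.0], [13.0]])]
indexes = [0, 3]
batch = [
    f.batch[indexes] if hasattr(f, "batch") else np.stack(f[indexes], axis=0)
    for f in features
]
print(batch[0].shape, batch[1].shape)  # (2, 2) (2, 1)
```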
@@ -263,80 +263,122 @@ def __getitem__(self, choice_index): indexes of the choices (that will be mapped to choice & session indexes) to return """ - if isinstance(choice_index, list): - items_features = self._get_items_features() - # Get the session indexes - sessions_indexes = [self.choice_dataset.indexes[i] for i in choice_index] + if isinstance(choices_indexes, list): + fixed_items_features = self._get_fixed_items_features() - sessions_features = self._get_sessions_features(sessions_indexes) - sessions_items_features = self._get_sessions_items_features(sessions_indexes) + # Get the session indexes + contexts_features = self._get_contexts_features(choices_indexes) + contexts_items_features = self._get_contexts_items_features(choices_indexes) - if self.choice_dataset.sessions_items_availabilities is None: - sessions_items_availabilities = None + if self.choice_dataset.contexts_items_availabilities is None: + contexts_items_availabilities = None else: - if hasattr(self.choice_dataset.sessions_items_availabilities, "iloc"): - sessions_items_availabilities = ( - self.choice_dataset.sessions_items_availabilities.iloc[ - sessions_indexes + if hasattr(self.choice_dataset.contexts_items_availabilities, "batch"): + contexts_items_availabilities = ( + self.choice_dataset.contexts_items_availabilities.batch[ + choices_indexes ].astype(self.choice_dataset._return_types[3]) ) else: - sessions_items_availabilities = ( - self.choice_dataset.sessions_items_availabilities[sessions_indexes].astype( + contexts_items_availabilities = ( + self.choice_dataset.contexts_items_availabilities[choices_indexes].astype( self.choice_dataset._return_types[3] ) ) - choice = self.choice_dataset.choices[choice_index].astype( + for indexes, func in self.choice_dataset.fixed_items_features_map: + fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ + fixed_items_features[indexes[0]][:, indexes[1]] + ] + for indexes, func in self.choice_dataset.contexts_features_map: + contexts_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ + contexts_features[indexes[0]][:, indexes[1]] + ] + for indexes, func in self.choice_dataset.contexts_items_features_map: + contexts_items_features[indexes[0]][:, :, indexes[1] : indexes[1] + 1] = func[ + contexts_items_features[indexes[0]][:, :, indexes[1]] + ] + # items_features were not given as a tuple, so we return do not return it as a tuple + if not self.choice_dataset._return_items_features_tuple: + fixed_items_features = fixed_items_features[0] + if not self.choice_dataset._return_contexts_features_tuple: + contexts_features = contexts_features[0] + # sessions_items_features were not given as a tuple, so we return do not return + # it as a tuple + if not self.choice_dataset._return_contexts_items_features_tuple: + contexts_items_features = contexts_items_features[0] + + choices = self.choice_dataset.choices[choices_indexes].astype( self.choice_dataset._return_types[4] ) return ( - items_features, - sessions_features, - sessions_items_features, - sessions_items_availabilities, - choice, + fixed_items_features, + contexts_features, + contexts_items_features, + contexts_items_availabilities, + choices, ) - if isinstance(choice_index, slice): + if isinstance(choices_indexes, slice): return self.__getitem__( - list(range(*choice_index.indices(self.choice_dataset.choices.shape[0]))) + list(range(*choices_indexes.indices(self.choice_dataset.choices.shape[0]))) ) - if isinstance(choice_index, int): - items_features = self._get_items_features() + if 
isinstance(choices_indexes, int): + fixed_items_features = self._get_fixed_items_features() # Get the session indexes - sessions_indexes = self.choice_dataset.indexes[choice_index] - sessions_features = self._get_sessions_features(sessions_indexes) - sessions_items_features = self._get_sessions_items_features(sessions_indexes) + contexts_features = self._get_contexts_features(choices_indexes) + contexts_items_features = self._get_contexts_items_features(choices_indexes) - if self.choice_dataset.sessions_items_availabilities is None: - sessions_items_availabilities = None + if self.choice_dataset.contexts_items_availabilities is None: + contexts_items_availabilities = None else: - if hasattr(self.choice_dataset.sessions_items_availabilities, "iloc"): - sessions_items_availabilities = ( - self.choice_dataset.sessions_items_availabilities.iloc[ - sessions_indexes + if hasattr(self.choice_dataset.contexts_items_availabilities, "batch"): + contexts_items_availabilities = ( + self.choice_dataset.contexts_items_availabilities.iloc[ + choices_indexes ].astype(self.choice_dataset._return_types[3]) ) else: - sessions_items_availabilities = ( - self.choice_dataset.sessions_items_availabilities[sessions_indexes].astype( + contexts_items_availabilities = ( + self.choice_dataset.contexts_items_availabilities[choices_indexes].astype( self.choice_dataset._return_types[3] ) ) + for indexes, func in self.choice_dataset.fixed_items_features_map: + fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ + fixed_items_features[indexes[0]][:, indexes[1]] + ] + for indexes, func in self.choice_dataset.contexts_features_map: + contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = func[ + contexts_features[indexes[0]][indexes[1]] + ] + for indexes, func in self.choice_dataset.contexts_items_features_map: + contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ + contexts_items_features[indexes[0]][:, indexes[1]] + ] - choice = self.choice_dataset.choices[choice_index].astype( + # items_features were not given as a tuple, so we return do not return it as a tuple + if not self.choice_dataset._return_items_features_tuple: + fixed_items_features = fixed_items_features[0] + if not self.choice_dataset._return_contexts_features_tuple: + contexts_features = contexts_features[0] + # sessions_items_features were not given as a tuple, so we return do not return + # it as a tuple + if not self.choice_dataset._return_contexts_items_features_tuple: + contexts_items_features = contexts_items_features[0] + + choice = self.choice_dataset.choices[choices_indexes].astype( self.choice_dataset._return_types[4] ) return ( - items_features, - sessions_features, - sessions_items_features, - sessions_items_availabilities, + fixed_items_features, + contexts_features, + contexts_items_features, + contexts_items_availabilities, choice, ) raise NotImplementedError From ab508b37148d321b500e3f3971d827b8c998f649 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Sat, 13 Jan 2024 19:57:31 +0100 Subject: [PATCH 13/22] ADD: fully functional new signature --- choice_learn/data/choice_dataset.py | 95 +++++++++++++++---------- choice_learn/data/indexer.py | 103 +++++++++++++++++++--------- choice_learn/data/storage.py | 23 +++++-- 3 files changed, 145 insertions(+), 76 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 8e2660e2..a98577b6 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -4,7 +4,6 @@ import 
pandas as pd from choice_learn.data.indexer import ChoiceDatasetIndexer -from choice_learn.data.store import Store class ChoiceDataset(object): @@ -319,7 +318,7 @@ def _build_features_by_ids(self): for j, column_name in enumerate(feature): for feature_by_id in self.features_by_ids: if column_name == feature_by_id.name: - fixed_items_features_map.append(((i, j), feature_by_id.batch)) + fixed_items_features_map.append(((i, j), feature_by_id)) if self.contexts_features_names is not None: for i, feature in enumerate(self.contexts_features_names): @@ -327,7 +326,7 @@ def _build_features_by_ids(self): for j, column_name in enumerate(feature): for feature_by_id in self.features_by_ids: if column_name == feature_by_id.name: - contexts_features_map.append(((i, j), feature_by_id.batch)) + contexts_features_map.append(((i, j), feature_by_id)) if self.contexts_items_features_names is not None: for i, feature in enumerate(self.contexts_items_features_names): @@ -335,7 +334,7 @@ def _build_features_by_ids(self): for k, column_name in enumerate(feature): for feature_by_id in self.features_by_ids: if column_name == feature_by_id.name: - contexts_items_features_map.append(((i, j), feature_by_id.batch)) + contexts_items_features_map.append(((i, j), feature_by_id)) assert len(fixed_items_features_map) + len(contexts_features_map) + len( contexts_items_features_map @@ -463,6 +462,10 @@ def _check_types(self): item_types.append(np.int32) else: item_types.append(np.float32) + + for indexes, f_by_id in self.fixed_items_features_map: + sample_dtype = f_by_id.get_storage_types() + item_types[indexes[0]] = sample_dtype return_types.append(tuple(item_types)) session_types = [] @@ -472,6 +475,9 @@ def _check_types(self): session_types.append(np.int32) else: session_types.append(np.float32) + for indexes, f_by_id in self.contexts_features_map: + sample_dtype = f_by_id.get_storage_type() + session_types[indexes[0]] = sample_dtype return_types.append(tuple(session_types)) session_item_types = [] @@ -481,6 +487,9 @@ def _check_types(self): session_item_types.append(np.int32) else: session_item_types.append(np.float32) + for indexes, f_by_id in self.contexts_items_features_map: + sample_dtype = f_by_id.get_storage_types() + session_item_types[indexes[0]] = sample_dtype return_types.append(tuple(session_item_types)) return_types.append(np.float32) return_types.append(np.int32) @@ -828,55 +837,67 @@ def get_choices_batch(self, choices_indexes, features=None): choices = self.choices[choices_indexes].astype(self._return_types[4]) - for indexes, func in self.fixed_items_features_map: + for indexes, f_by_id in self.fixed_items_features_map: fixed_items_features[indexes[0]] = np.concatenate( [ fixed_items_features[indexes[0]][:, : indexes[1]], - func[fixed_items_features[indexes[0]][:, indexes[1]]], + f_by_id.batch[fixed_items_features[indexes[0]][:, indexes[1]]], fixed_items_features[indexes[0]][:, indexes[1] + 1 :], ], axis=1, ) - for indexes, func in self.contexts_features_map: + for indexes, f_by_id in self.contexts_features_map: contexts_features[indexes[0]] = np.concatenate( [ contexts_features[indexes[0]][:, : indexes[1]], - func[contexts_features[indexes[0]][:, indexes[1]]], + f_by_id.batch[contexts_features[indexes[0]][:, indexes[1]]], contexts_features[indexes[0]][:, indexes[1] + 1 :], ], axis=1, ) - for indexes, func in self.contexts_items_features_map: - contexts_items_features[indexes[0]][:, :, indexes[1] : indexes[1] + 1] = func[ - contexts_items_features[indexes[0]][:, :, indexes[1]] - ] + for indexes, f_by_id in 
self.contexts_items_features_map:
+            contexts_items_features[indexes[0]][
+                :, :, indexes[1] : indexes[1] + 1
+            ] = f_by_id.batch[contexts_items_features[indexes[0]][:, :, indexes[1]]]
             contexts_items_features[indexes[0]] = np.concatenate(
                 [
                     contexts_items_features[indexes[0]][:, :, : indexes[1]],
-                    func[contexts_items_features[indexes[0]][:, :, indexes[1]]],
+                    f_by_id.batch[contexts_items_features[indexes[0]][:, :, indexes[1]]],
                     contexts_items_features[indexes[0]][:, :, indexes[1] + 1 :],
                 ],
                 axis=2,
             )

-        # items_features were not given as a tuple, so we return do not return it as a tuple
-        if self._return_items_features_tuple and self.fixed_items_features is not None:
-            fixed_items_features = tuple(fixed_items_features)
-        elif self.fixed_items_features is not None:
-            fixed_items_features = fixed_items_features[0]
-        if self._return_contexts_features_tuple and self.contexts_features is not None:
-            contexts_features = tuple(contexts_features)
-        elif self.contexts_features is not None:
-            contexts_features = contexts_features[0]
-        # sessions_items_features were not given as a tuple, so we return do not return
-        # it as a tuple
-        if (
-            self._return_contexts_items_features_tuple
-            and self.contexts_items_features is not None
-        ):
-            contexts_items_features = tuple(contexts_items_features)
-        elif self.contexts_items_features is not None:
-            contexts_items_features = contexts_items_features[0]
+        if fixed_items_features is not None:
+            for i in range(len(fixed_items_features)):
+                fixed_items_features[i] = fixed_items_features[i].astype(
+                    self._return_types[0][i]
+                )
+            # items_features were not given as a tuple, so we do not return it as a tuple
+            if not self._return_items_features_tuple:
+                fixed_items_features = fixed_items_features[0]
+            else:
+                fixed_items_features = tuple(fixed_items_features)
+
+        if contexts_features is not None:
+            for i in range(len(contexts_features)):
+                contexts_features[i] = contexts_features[i].astype(self._return_types[1][i])
+            if not self._return_contexts_features_tuple:
+                contexts_features = contexts_features[0]
+            else:
+                contexts_features = tuple(contexts_features)
+
+        if contexts_items_features is not None:
+            for i in range(len(contexts_items_features)):
+                contexts_items_features[i] = contexts_items_features[i].astype(
+                    self._return_types[2][i]
+                )
+            # sessions_items_features were not given as a tuple, so we do not return
+            # it as a tuple
+            if not self._return_contexts_items_features_tuple:
+                contexts_items_features = contexts_items_features[0]
+            else:
+                contexts_items_features = tuple(contexts_items_features)

         return (
             fixed_items_features,
@@ -922,16 +943,16 @@ def get_choices_batch(self, choices_indexes, features=None):
         else:
             contexts_items_availabilities = self.contexts_items_availabilities[choices_indexes]

-        for indexes, func in self.fixed_items_features_map:
-            fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[
+        for indexes, f_by_id in self.fixed_items_features_map:
+            fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = f_by_id.batch[
                 fixed_items_features[indexes[0]][:, indexes[1]]
             ]
-        for indexes, func in self.contexts_features_map:
-            contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = func[
+        for indexes, f_by_id in self.contexts_features_map:
+            contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = f_by_id.batch[
                 contexts_features[indexes[0]][indexes[1]]
             ]
-        for indexes, func in self.contexts_items_features_map:
-            contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[
+        for 
indexes, f_by_id in self.contexts_items_features_map: + contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = f_by_id.batch[ contexts_items_features[indexes[0]][:, indexes[1]] ] diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py index 723ec7d0..8a1cca28 100644 --- a/choice_learn/data/indexer.py +++ b/choice_learn/data/indexer.py @@ -165,7 +165,7 @@ def _get_fixed_items_features(self): if self.choice_dataset.fixed_items_features is None: items_features = None else: - items_features = tuple( + items_features = list( items_feature.astype(self.choice_dataset._return_types[0][i]) for i, items_feature in enumerate(self.choice_dataset.fixed_items_features) ) @@ -200,11 +200,7 @@ def _get_contexts_features(self, contexts_indexes): ) ) else: - contexts_features.append( - np.stack(contexts_feature[contexts_indexes], axis=0).astype( - self.choice_dataset._return_types[1][i] - ) - ) + contexts_features.append(np.stack(contexts_feature[contexts_indexes], axis=0)) # sessions_features were not given as a tuple, so we return do not return it as a tuple # if not self.choice_dataset._return_contexts_features_tuple: # contexts_features = contexts_feature[0] @@ -286,27 +282,68 @@ def __getitem__(self, choices_indexes): ) ) - for indexes, func in self.choice_dataset.fixed_items_features_map: - fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ - fixed_items_features[indexes[0]][:, indexes[1]] - ] - for indexes, func in self.choice_dataset.contexts_features_map: - contexts_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[ - contexts_features[indexes[0]][:, indexes[1]] - ] - for indexes, func in self.choice_dataset.contexts_items_features_map: - contexts_items_features[indexes[0]][:, :, indexes[1] : indexes[1] + 1] = func[ - contexts_items_features[indexes[0]][:, :, indexes[1]] - ] - # items_features were not given as a tuple, so we return do not return it as a tuple - if not self.choice_dataset._return_items_features_tuple: - fixed_items_features = fixed_items_features[0] - if not self.choice_dataset._return_contexts_features_tuple: - contexts_features = contexts_features[0] - # sessions_items_features were not given as a tuple, so we return do not return - # it as a tuple - if not self.choice_dataset._return_contexts_items_features_tuple: - contexts_items_features = contexts_items_features[0] + for indexes, f_by_id in self.choice_dataset.fixed_items_features_map: + fixed_items_features[indexes[0]] = np.concatenate( + [ + fixed_items_features[indexes[0]][:, : indexes[1]], + f_by_id.batch[fixed_items_features[indexes[0]][:, indexes[1]]], + fixed_items_features[indexes[0]][:, indexes[1] + 1 :], + ], + axis=1, + ) + for indexes, f_by_id in self.choice_dataset.contexts_features_map: + contexts_features[indexes[0]] = np.concatenate( + [ + contexts_features[indexes[0]][:, : indexes[1]], + f_by_id.batch[contexts_features[indexes[0]][:, indexes[1]]], + contexts_features[indexes[0]][:, indexes[1] + 1 :], + ], + axis=1, + ) + for indexes, f_by_id in self.choice_dataset.contexts_items_features_map: + contexts_items_features[indexes[0]][ + :, :, indexes[1] : indexes[1] + 1 + ] = f_by_id.batch[contexts_items_features[indexes[0]][:, :, indexes[1]]] + contexts_items_features[indexes[0]] = np.concatenate( + [ + contexts_items_features[indexes[0]][:, :, : indexes[1]], + f_by_id.batch[contexts_items_features[indexes[0]][:, :, indexes[1]]], + contexts_items_features[indexes[0]][:, :, indexes[1] + 1 :], + ], + axis=2, + ) + if fixed_items_features is not None: 
+            for i in range(len(fixed_items_features)):
+                fixed_items_features[i] = fixed_items_features[i].astype(
+                    self.choice_dataset._return_types[0][i]
+                )
+            # items_features were not given as a tuple, so we do not return it as a tuple
+            if not self.choice_dataset._return_items_features_tuple:
+                fixed_items_features = fixed_items_features[0]
+            else:
+                fixed_items_features = tuple(fixed_items_features)
+
+        if contexts_features is not None:
+            for i in range(len(contexts_features)):
+                contexts_features[i] = contexts_features[i].astype(
+                    self.choice_dataset._return_types[1][i]
+                )
+            if not self.choice_dataset._return_contexts_features_tuple:
+                contexts_features = contexts_features[0]
+            else:
+                contexts_features = tuple(contexts_features)
+
+        if contexts_items_features is not None:
+            for i in range(len(contexts_items_features)):
+                contexts_items_features[i] = contexts_items_features[i].astype(
+                    self.choice_dataset._return_types[2][i]
+                )
+            # sessions_items_features were not given as a tuple, so we do not return
+            # it as a tuple
+            if not self.choice_dataset._return_contexts_items_features_tuple:
+                contexts_items_features = contexts_items_features[0]
+            else:
+                contexts_items_features = tuple(contexts_items_features)

         choices = self.choice_dataset.choices[choices_indexes].astype(
             self.choice_dataset._return_types[4]
@@ -347,16 +384,16 @@ def __getitem__(self, choices_indexes):
                     self.choice_dataset._return_types[3]
                 )
             )
-        for indexes, func in self.choice_dataset.fixed_items_features_map:
-            fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[
+        for indexes, f_by_id in self.choice_dataset.fixed_items_features_map:
+            fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = f_by_id.batch[
                 fixed_items_features[indexes[0]][:, indexes[1]]
             ]
-        for indexes, func in self.choice_dataset.contexts_features_map:
-            contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = func[
+        for indexes, f_by_id in self.choice_dataset.contexts_features_map:
+            contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = f_by_id.batch[
                 contexts_features[indexes[0]][indexes[1]]
             ]
-        for indexes, func in self.choice_dataset.contexts_items_features_map:
-            contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = func[
+        for indexes, f_by_id in self.choice_dataset.contexts_items_features_map:
+            contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = f_by_id.batch[
                 contexts_items_features[indexes[0]][:, indexes[1]]
             ]

diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py
index a96167aa..992a3e62 100644
--- a/choice_learn/data/storage.py
+++ b/choice_learn/data/storage.py
@@ -51,6 +51,8 @@ def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=
             assert isinstance(v, np.ndarray) | isinstance(v, list)
             assert len(np.array(v).shape) == 1
             lengths.append(len(v))
+            if isinstance(v, list):
+                storage[k] = np.array(v)
         assert len(set(lengths)) == 1

     elif isinstance(values, pd.DataFrame):
@@ -63,7 +65,7 @@ def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=
     elif isinstance(values, list) or isinstance(values, np.ndarray):
         if ids is None:
             ids = list(range(len(values)))
-        storage = {k: v for (k, v) in zip(ids, values)}
+        storage = {k: np.array(v) for (k, v) in zip(ids, values)}
     else:
         raise ValueError("values must be a dict, a DataFrame, a list or a numpy array")

@@ -74,7 +76,7 @@ def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=
     self.shape = (len(self), 
len(next(iter(self.storage.values()))))
         self.indexer = indexer(self)

-    def _get_store_element(self, index):
+    def get_element_from_index(self, index):
         """Getter method over self.sequence.

         Returns the features stored at index index. Compared to __getitem__, it does not take
         the index-th element of sequence but the index-th element of the store.
@@ -90,10 +92,8 @@ def _get_store_element(self, index):
         array_like
             features corresponding to the index index in self.store
         """
-        if isinstance(index, list):
-            return [self.store[i] for i in index]
-        # else:
-        return self.store[index]
+        keys = list(self.storage.keys())[index]
+        return self.storage[keys]

     def __len__(self):
         """Returns the length of the sequence of apparition of the features."""
@@ -110,6 +110,17 @@ def __getitem__(self, keys):
         sub_storage = {k: self.storage[k] for k in keys}
         return FeaturesStorage(values=sub_storage, values_names=self.values_names, name=self.name)

+    def get_storage_type(self):
+        """Function to access the dtype of the stored elements.
+
+        Returns:
+        --------
+        np.dtype
+            dtype of the stored features, as returned by np.dtype
+        """
+        element = self.get_element_from_index(0)
+        return element.dtype
+
     @property
     def batch(self):
         """Indexing attribute."""

From 4a233983d0a4f663e03a7856fe382a0090bb2e01 Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Sat, 13 Jan 2024 21:48:28 +0100
Subject: [PATCH 14/22] DOC: redocumented

---
 choice_learn/data/choice_dataset.py | 152 +++++++++++++++-------
 choice_learn/data/indexer.py        | 112 ++++++++++------
 choice_learn/data/storage.py        | 193 +++++++++++++++++++++++++---
 3 files changed, 356 insertions(+), 101 deletions(-)

diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py
index a98577b6..a261103a 100644
--- a/choice_learn/data/choice_dataset.py
+++ b/choice_learn/data/choice_dataset.py
@@ -26,9 +26,9 @@ def __init__(
         choices,  # Should not have None as default value ?
         fixed_items_features=None,
         contexts_features=None,  # as many context as choices. values or ids (look at key)
-        contexts_items_features=None,  # MUST INCLUDE item_id column; possible: column "available" binary
+        contexts_items_features=None,
         contexts_items_availabilities=None,
-        features_by_ids=[],  # list of (name, FeaturesStorage) --> requires to have df with col "_id"
+        features_by_ids=[],  # list of (name, FeaturesStorage)
         fixed_items_features_names=None,
         contexts_features_names=None,
         contexts_items_features_names=None,
@@ -37,16 +37,33 @@ def __init__(

         Parameters
         ----------
-        items_features : tuple of (array_like, )
+        choices: list or np.ndarray
+            list of chosen items indexes
+        fixed_items_features : tuple of (array_like, )
             matrix of shape (num_items, num_items_features) containing the features of the items
-            e.g. item color
-        choices: list of list
-            for each choice we have a list of related choices. Main list has same legnth as
-            session_features.
-        suffle: bool, optional
-            whether to shuffle the dataset or not
+            that never change, e.g. item color, default is None
+        contexts_features : tuple of (array_like, )
+            matrix of shape (num_choices, num_contexts_features) containing the features of the
+            different contexts that are common to all items (e.g. store features,
+            customer features, etc...)
+        contexts_items_features : tuple of (array_like, ), default is None
+            matrix of shape (num_choices, num_items, num_contexts_items_features)
+            containing the features
+            of the items that change over time (e.g. 
price, promotion, etc...)
+        contexts_items_availabilities : array_like
+            matrix of shape (num_choices, num_items) containing the availabilities of the items
+            over the different choices, default is None
+        features_by_ids : list of (name, FeaturesStorage)
+            List of Storage objects. Their name must correspond to a feature name
+            among fixed_items, contexts,
+            contexts_items and their ids must match those features values. Default is []
+        fixed_items_features_names : tuple of (array_like, )
+            list of names of the fixed_items_features, default is None
+        contexts_features_names : tuple of (array_like, )
+            list of names of the contexts_features, default is None
+        contexts_items_features_names : tuple of (array_like, )
+            list of names of the contexts_items_features, default is None
         """
-
         if choices is None:
             # Done to keep a logical order of arguments, and has logic: choices have to be specified
             raise ValueError("Choices must be specified, got None")
@@ -58,9 +75,11 @@ def __init__(
         if fixed_items_features is not None:
             if not isinstance(fixed_items_features, tuple):
                 if fixed_items_features_names is not None:
-                    assert len(fixed_items_features) == len(
-                        fixed_items_features_names
-                    ), "Number of features given does not match number of features names given."
+                    if len(fixed_items_features[0]) != len(fixed_items_features_names):
+                        raise ValueError(
+                            """Number of features given does not match number
+                            of features names given."""
+                        )

                 self._return_items_features_tuple = False
                 fixed_items_features = (fixed_items_features,)
@@ -87,9 +106,11 @@ def __init__(
             if not isinstance(contexts_features, tuple):
                 self._return_contexts_features_tuple = False
                 if contexts_items_features_names is not None:
-                    assert len(contexts_features[0]) == len(
-                        contexts_features_names
-                    ), "Number of features given does not match number of features names given."
+                    if len(contexts_features[0]) == len(contexts_features_names):
+                        raise ValueError(
+                            """Number of features given does not match
+                            number of features names given."""
+                        )

                 contexts_features_names = (contexts_features_names,)
                 contexts_features = (contexts_features,)
@@ -101,7 +122,8 @@ def __init__(
             for f, name in zip(contexts_features, contexts_features_names):
                 if len(f[0]) != len(name):
                     raise ValueError(
-                        "contexts_features shape and contexts_features_names shape do not match"
+                        """contexts_features shape and contexts_features_names
+                        shape do not match"""
                     )

         # In this case names are missing, still transform it as a tuple
@@ -113,9 +135,11 @@ def __init__(
         if not isinstance(contexts_items_features, tuple) and contexts_items_features is not None:
             self._return_contexts_items_features_tuple = False
             if contexts_items_features_names is not None:
-                assert (
-                    len(contexts_items_features[0][0]) == len(contexts_items_features_names)
-                ), "Number of features given does not match number of features names given for contexts_items."
+                if len(contexts_items_features[0][0]) != len(contexts_items_features_names):
+                    raise ValueError(
+                        """Number of features given does not match
+                        number of features names given for contexts_items:
+                        {len(contexts_items_features[0][0])} and
+                        {len(contexts_items_features_names)}""".format()
+                    )
             contexts_items_features = (contexts_items_features,)
             contexts_items_features_names = (contexts_items_features_names,)
@@ -124,7 +148,8 @@ def __init__(
             for f, name in zip(contexts_items_features, contexts_items_features_names):
                 if len(f[0][0]) != len(name):
                     raise ValueError(
-                        "contexts_items_features shape and contexts_items_features_names shape do not match"
+                        """contexts_items_features shape and
+                        contexts_items_features_names shape do not match"""
                     )
             self._return_contexts_items_features_tuple = True
         # In this case names are missing, still transform it as a tuple
@@ -295,6 +320,20 @@ def __init__(
         self.indexer = ChoiceDatasetIndexer(self)

     def _build_features_by_ids(self):
+        """Builds the mapping functions.
+
+        Those mapping functions are used at indexing time so that
+        the features are rebuilt from the features_by_ids.
+
+        Returns:
+        --------
+        tuple
+            indexes and features_by_id of fixed_items_features
+        tuple
+            indexes and features_by_id of contexts_features
+        tuple
+            indexes and features_by_id of contexts_items_features
+        """
         if len(self.features_by_ids) == 0:
             print("No features_by_ids given.")
             return [], [], []
@@ -336,9 +375,10 @@ def _build_features_by_ids(self):
                     if column_name == feature_by_id.name:
                         contexts_items_features_map.append(((i, k), feature_by_id))

-        assert len(fixed_items_features_map) + len(contexts_features_map) + len(
+        if len(fixed_items_features_map) + len(contexts_features_map) + len(
             contexts_items_features_map
-        ) == len(self.features_by_ids), "Some features_by_ids were not matched with features_names."
+        ) != len(self.features_by_ids):
+            raise ValueError("Some features_by_ids were not matched with features_names.")

         return fixed_items_features_map, contexts_features_map, contexts_items_features_map

@@ -569,9 +609,9 @@ def _contexts_items_features_df_to_np(
         contexts_index,
         features,
         items_id_column="item_id",
-        contexts_id_column="session_id",
+        contexts_id_column="context_id",
    ):
-        """Builds sessions_items_features and sessions_items_availabilities from dataframe.
+        """Builds contexts_items_features and contexts_items_availabilities from dataframe.
Parameters
        ----------
        df : pandas.DataFrame
            Dataframe containing all the features for each item and context
        items_index : list
            List of items
-        sessions_index : list
+        contexts_index : list
            List of contexts
        features : list
            List of columns of df that represent the items_features (for contexts_items_features)
+        items_id_column: str, optional
+            Name of the column containing the item ids, default is "item_id"
+        contexts_id_column: str, optional
+            Name of the column containing the context ids, default is "context_id"

        Returns:
        -------
-        np.ndarray of shape (n_sessions, n_items, n_features)
-            Corresponding sessions_items_features
-        np.ndarray of shape (n_sessions, n_items)
+        np.ndarray of shape (n_choices, n_items, n_features)
+            Corresponding contexts_items_features
+        np.ndarray of shape (n_choices, n_items)
            Corresponding availabilities
        """
        try:
-            features.remove("session_id")
+            features.remove("context_id")
        except ValueError:
            pass
        try:
@@ -637,9 +681,9 @@ def _contexts_items_features_df_to_np(
    def from_single_df(
        cls,
        df,
-        fixed_items_features_columns,
-        contexts_features_columns,
-        contexts_items_features_columns,
+        fixed_items_features_columns=None,
+        contexts_features_columns=None,
+        contexts_items_features_columns=None,
        items_id_column="item_id",
        contexts_id_column="context_id",
        choices_column="choice",
@@ -651,18 +695,21 @@ def from_single_df(
        ----------
        df : pandas.DataFrame
            dataframe in Long format
-        items_features_columns : list
-            Columns of the dataframe that are item features
-        sessions_features_columns : list
-            Columns of the dataframe that are session features
-        sessions_items_features_columns : list
-            Columns of the dataframe that are session-item features
+        fixed_items_features_columns : list
+            Columns of the dataframe that are item features, default is None
+        contexts_features_columns : list
+            Columns of the dataframe that are contexts features, default is None
+        contexts_items_features_columns : list
+            Columns of the dataframe that are context-item features, default is None
        items_id_column: str, optional
            Name of the column containing the item ids, default is "item_id"
-        sessions_id_column: str, optional
-            Name of the column containing the sessions ids, default is "sessions_id"
+        contexts_id_column: str, optional
+            Name of the column containing the context ids, default is "context_id"
        choices_column: str, optional
            Name of the column containing the choices, default is "choice"
+        choice_mode: str, optional
+            How choice is indicated in df, either "items_name" or "one_zero",
+            default is "items_name"

        Returns:
        -------
@@ -795,10 +842,25 @@ def get_choices_batch(self, choices_indexes, features=None):

        Parameters
        ----------
-        index : int or list of int or slice
+        choices_indexes : int or list of int or slice
            indexes of the choices (that will be mapped to choice & context indexes) to return
+        features : list of str, optional
+            list of features to return. None returns all of them, default is None.
+
+        Returns:
+        --------
+        tuple of (array_like, )
+            tuple of arrays containing the features of the different items
+        tuple of (array_like, )
+            tuple of arrays containing the features of the different contexts
+        tuple of (array_like, )
+            tuple of arrays containing the features of the different contexts_items
+        array_like
+            array containing the availabilities of the different items
+        array_like
+            array containing the choices (indexes of chosen items)
         """
+        _ = features
         if isinstance(choices_indexes, list):
             if self.fixed_items_features is None:
                 fixed_items_features = None
@@ -969,8 +1031,8 @@ def __getitem__(self, choices_indexes):

         Parameters
         ----------
-        indexes : np.ndarray
-            indexes of the sessions to keep, shape should be (num_sessions,)
+        choices_indexes : np.ndarray
+            indexes of the contexts / choices to keep, shape should be (num_choices,)

         Returns:
         -------
@@ -1002,7 +1064,7 @@ def __getitem__(self, choices_indexes):

     @property
     def batch(self):
-        """Indexer."""
+        """Indexer. Corresponds to get_choices_batch, but with [] logic."""
         return self.indexer

     def iter_batch(self, batch_size, shuffle=None, sample_weight=None):

diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py
index 8a1cca28..6c9137dc 100644
--- a/choice_learn/data/indexer.py
+++ b/choice_learn/data/indexer.py
@@ -74,8 +74,8 @@ def __init__(self, storage):

         Parameters
         ----------
-        store : choice_modeling.data.store.FeaturesStore
-            Store object to be indexed.
+        storage : choice_learn.data.storage.FeaturesStorage
+            Storage object to be indexed.
         """
         self.storage = storage

@@ -84,13 +84,13 @@ def __getitem__(self, sequence_keys):

         Parameters
         ----------
-        sequence_index : (int, list, slice)
-            index position of the sequence
+        sequence_keys : (int, list, slice)
+            keys of values to be retrieved

         Returns:
         --------
         array_like
-            features corresponding to the sequence_index-th position of sequence
+            features corresponding to the sequence_keys
         """
         if isinstance(sequence_keys, Iterable):
             return np.array([self.storage.storage[key] for key in sequence_keys])

         # else:
         return np.array(self.storage.storage[sequence_keys])


+class OneHotStorageIndexer(Indexer):
+    """Class for Ilocing OneHotStorage."""
+
+    def __init__(self, storage):
+        """OneHotStorageIndexer constructor.
+
+        Parameters
+        ----------
+        storage : choice_learn.data.storage.OneHotStorage
+            OneHotStorage object to be indexed.
+        """
+        self.storage = storage
+        self.shape = storage.shape
+        self.dtype = storage.dtype
+
+    def __getitem__(self, sequence_keys):
+        """Gets the indexes of the 1 values corresponding to sequence_keys and builds the OneHot matrix.
+
+        Parameters
+        ----------
+        sequence_keys : (int, list, slice)
+            keys of values to be retrieved
+
+        Returns:
+        --------
+        np.ndarray
+            OneHot reconstructed vectors corresponding to sequence_keys
+        """
+        if isinstance(sequence_keys, list):
+            # Construction of the OneHot vector from the index of the 1 value
+            one_hot = np.zeros((len(sequence_keys), self.shape[1]))
+            for i, j in enumerate(sequence_keys):
+                one_hot[i, self.storage.storage[j]] = 1
+            return one_hot.astype(self.dtype)
+        if isinstance(sequence_keys, slice):
+            return self[list(range(*sequence_keys.indices(self.shape[0])))]
+        # else:
+        one_hot = np.zeros(self.shape[1])
+        one_hot[self.storage.storage[sequence_keys]] = 1
+        return one_hot.astype(self.dtype)


 class OneHotStoreIndexer(Indexer):
     """Class for Ilocing OneHotStore."""

@@ -169,24 +211,20 @@ def _get_fixed_items_features(self):
                 items_feature.astype(self.choice_dataset._return_types[0][i])
                 for i, items_feature in enumerate(self.choice_dataset.fixed_items_features)
             )
-        # items_features were not given as a tuple, so we return do not return it as a tuple
-        # if not self.choice_dataset._return_items_features_tuple:
-        #     items_features = items_features[0]
-
         return items_features

-    def _get_contexts_features(self, contexts_indexes):
+    def _get_contexts_features(self, choices_indexes):
         """Method to access contexts features of the ChoiceDataset.

         Parameters
         ----------
-        sessions_indexes : list of ints or int
-            indexes of the sessions to return
+        choices_indexes : list of ints or int
+            choices indexes of the contexts features to return

         Returns:
         --------
         tuple of np.ndarray or np.ndarray
-            items_features of the ChoiceDataset
+            right indexed contexts_features of the ChoiceDataset
         """
         if self.choice_dataset.contexts_features is None:
             contexts_features = None
@@ -195,31 +233,26 @@ def _get_contexts_features(self, contexts_indexes):
             for i, contexts_feature in enumerate(self.choice_dataset.contexts_features):
                 if hasattr(contexts_feature, "batch"):
                     contexts_features.append(
-                        contexts_feature.batch[contexts_indexes].astype(
+                        contexts_feature.batch[choices_indexes].astype(
                             self.choice_dataset._return_types[1][i]
                         )
                     )
                 else:
-                    contexts_features.append(np.stack(contexts_feature[contexts_indexes], axis=0))
+                    contexts_features.append(np.stack(contexts_feature[choices_indexes], axis=0))
         return contexts_features

-    def _get_contexts_items_features(self, contexts_indexes):
+    def _get_contexts_items_features(self, choices_indexes):
         """Method to access contexts items features of the ChoiceDataset.
Parameters ---------- - sessions_indexes : list of ints or int - indexes of the sessions to return + choices_indexes : list of ints or int + indexes of the choices for which we want the contexts items features Returns: -------- tuple of np.ndarray or np.ndarray - items_features of the ChoiceDataset + right indexes contexts_items_features of the ChoiceDataset """ if self.choice_dataset.contexts_items_features is None: return None @@ -227,19 +260,14 @@ def _get_contexts_items_features(self, contexts_indexes): for i, contexts_items_feature in enumerate(self.choice_dataset.contexts_items_features): if hasattr(contexts_items_feature, "iloc"): contexts_items_features.append( - contexts_items_feature.iloc[contexts_indexes].astype(self._return_types[2][i]) + contexts_items_feature.iloc[choices_indexes].astype(self._return_types[2][i]) ) else: contexts_items_features.append( - np.stack(contexts_items_feature[contexts_indexes], axis=0).astype( + np.stack(contexts_items_feature[choices_indexes], axis=0).astype( self.choice_dataset._return_types[2][i] ) ) - # sessions_items_features were not given as a tuple, thus we do not return it as a tuple - # if self.choice_dataset._return_contexts_items_features_tuple: - # contexts_items_features = tuple(contexts_items_features) - # else: - # contexts_items_features = contexts_items_features[0] return contexts_items_features def __getitem__(self, choices_indexes): @@ -248,24 +276,24 @@ def __getitem__(self, choices_indexes): One index corresponds to a choice within a session. Return order: - Fixed item features - - Session features - - Session item features + - Contexts features + - Contexts item features - Items availabilities - - Choice + - Choices Parameters ---------- - index : int or list of int or slice + choices_indexes : int or list of int or slice indexes of the choices (that will be mapped to choice & session indexes) to return """ if isinstance(choices_indexes, list): + # Get the features fixed_items_features = self._get_fixed_items_features() - - # Get the session indexes contexts_features = self._get_contexts_features(choices_indexes) contexts_items_features = self._get_contexts_items_features(choices_indexes) + # Get availabilities if self.choice_dataset.contexts_items_availabilities is None: contexts_items_availabilities = None else: @@ -281,7 +309,7 @@ def __getitem__(self, choices_indexes): self.choice_dataset._return_types[3] ) ) - + # Get choices for indexes, f_by_id in self.choice_dataset.fixed_items_features_map: fixed_items_features[indexes[0]] = np.concatenate( [ @@ -291,6 +319,7 @@ def __getitem__(self, choices_indexes): ], axis=1, ) + # Features by ID mapping for indexes, f_by_id in self.choice_dataset.contexts_features_map: contexts_features[indexes[0]] = np.concatenate( [ @@ -312,6 +341,7 @@ def __getitem__(self, choices_indexes): ], axis=2, ) + # Shaping and typing if fixed_items_features is not None: for i in range(len(fixed_items_features)): fixed_items_features[i] = fixed_items_features[i].astype( @@ -363,12 +393,12 @@ def __getitem__(self, choices_indexes): ) if isinstance(choices_indexes, int): + # Get the features fixed_items_features = self._get_fixed_items_features() - # Get the session indexes - contexts_features = self._get_contexts_features(choices_indexes) contexts_items_features = self._get_contexts_items_features(choices_indexes) + # Get availabilities if self.choice_dataset.contexts_items_availabilities is None: contexts_items_availabilities = None else: @@ -384,6 +414,7 @@ def __getitem__(self, choices_indexes): 
self.choice_dataset._return_types[3]
                 )
             )
+            # Features by ID mapping
             for indexes, f_by_id in self.choice_dataset.fixed_items_features_map:
                 fixed_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = f_by_id.batch[
                     fixed_items_features[indexes[0]][:, indexes[1]]
                 ]
             for indexes, f_by_id in self.choice_dataset.contexts_features_map:
                 contexts_features[indexes[0]][indexes[1] : indexes[1] + 1] = f_by_id.batch[
                     contexts_features[indexes[0]][indexes[1]]
                 ]
             for indexes, f_by_id in self.choice_dataset.contexts_items_features_map:
                 contexts_items_features[indexes[0]][:, indexes[1] : indexes[1] + 1] = f_by_id.batch[
                     contexts_items_features[indexes[0]][:, indexes[1]]
                 ]

+            # Shaping and typing
             # items_features were not given as a tuple, so we do not return it as a tuple
             if not self.choice_dataset._return_items_features_tuple:
                 fixed_items_features = fixed_items_features[0]

diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py
index 992a3e62..365f84de 100644
--- a/choice_learn/data/storage.py
+++ b/choice_learn/data/storage.py
@@ -4,56 +4,79 @@
 import numpy as np
 import pandas as pd

-from choice_learn.data.indexer import StorageIndexer
+from choice_learn.data.indexer import OneHotStorageIndexer, StorageIndexer


 class Storage(ABC):
-    """Class to keep OneHotStore and FeaturesStore with same parent."""
+    """Parent class of OneHotStorage and FeaturesStorage."""

     def __init__(self, features_to_store):
+        """Instantiation.
+
+        Parameters
+        ----------
+        features_to_store : object
+            Object to store
+        """
         self.features_to_store = features_to_store

     @abstractmethod
     def __getitem__(self, keys):
+        """Base function to access an element. To be implemented in children classes.
+
+        Parameters
+        ----------
+        keys : float, int, str or list of
+            values among indexes of the storage
+        """
         pass

     @abstractmethod
     def __len__(self):
+        """Returns the length of the sequence of apparition of the features."""
         pass

     @property
     def batch(self):
+        """Indexing method."""
         pass


 class FeaturesStorage(Storage):
-    """Class to keep OneHotStore and FeaturesStore with same parent."""
+    """Class to store features with their ids."""

     def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=StorageIndexer):
         """Builds the store.

         Parameters
         ----------
-        indexes : array_like or None
-            list of indexes of features to store. If None is given, indexes are created from
+        ids : array_like or None
+            list of ids of features to store. If None is given, ids are created from
             apparition order of values
         values : array_like
             list of values of features to store
-        sequence : array_like
-            sequence of apparitions of the features
+        values_names : array_like
+            Iterable of str indicating the name of the features. Must be same length as values.
name: string, optional
            name of the features store
        """
        if isinstance(values, dict):
            storage = values
            lengths = []
            for k, v in storage.items():
-                assert isinstance(v, np.ndarray) | isinstance(v, list)
-                assert len(np.array(v).shape) == 1
+                if not isinstance(v, np.ndarray) | isinstance(v, list):
+                    raise ValueError("values must be a dict of np.ndarray or list")
+                if not len(np.array(v).shape) == 1:
+                    raise ValueError(
+                        "values (features) must be a dict of np.ndarray or list of 1D arrays"
+                    )
                lengths.append(len(v))
                if isinstance(v, list):
                    storage[k] = np.array(v)
-            assert len(set(lengths)) == 1
+            if not len(set(lengths)) == 1:
+                raise ValueError("values (dict values) must all have same length")
+            if ids is not None:
+                print("Warning: ids is ignored when values is a dict")

        elif isinstance(values, pd.DataFrame):
            if values_names is not None:
@@ -99,15 +122,20 @@ def __init__(self, ids=None, values=None, values_names=None, name=None, indexer=
    def __len__(self):
        """Returns the length of the sequence of apparition of the features."""
        return len(self.storage)

-    def __getitem__(self, keys):
-        """_summary_.
+    def __getitem__(self, id_keys):
+        """Subset FeaturesStorage, keeping only features whose id is in id_keys.

        Parameters
        ----------
-        keys : _type_
-            _description_
+        id_keys : Iterable
+            List of ids to keep.
+
+        Returns:
+        --------
+        FeaturesStorage
+            Subset of the FeaturesStorage, with only the features whose id is in id_keys
        """
-        sub_storage = {k: self.storage[k] for k in keys}
+        sub_storage = {k: self.storage[k] for k in id_keys}
        return FeaturesStorage(values=sub_storage, values_names=self.values_names, name=self.name)

    def get_storage_type(self):
@@ -125,3 +153,136 @@ def batch(self):
        """Indexing attribute."""
        return self.indexer
+
+
+class OneHotStorage(Storage):
+    """Specific Storage for one hot features storage.
+
+    Inherits from Storage.
+    For example, it can be used to store a OneHot representation of the days of the week.
+
+    Has the same attributes as FeaturesStorage, only differs with some One-Hot optimized methods.
+    It only stores the indexes of the features, and creates the OneHot matrix
+    when needed, using .batch[].
+    """
+
+    def __init__(
+        self, ids=None, values=None, name=None, dtype=np.uint8, indexer=OneHotStorageIndexer
+    ):
+        """Builds the store.
+
+        Parameters
+        ----------
+        ids : array_like or None
+            list of ids of features to store. 
If None is given, ids are created from
+            apparition order of values
+        values : array_like
+            list of values of features to store
+        dtype: type
+            type for One Hot representation, usually int or float, default is np.uint8
+        name: string, optional
+            name of the features store
+        """
+        if isinstance(values, dict):
+            storage = values
+            for k, v in storage.items():
+                if not isinstance(v, int):
+                    raise ValueError(
+                        """values of values dict must be int as
+                        they are indexes of the one hot vector ones."""
+                    )
+            length = np.max(list(storage.values())) + 1
+            if ids is not None:
+                print("Warning: ids is ignored when values is a dict")
+
+        elif isinstance(values, list) or isinstance(values, np.ndarray):
+            if ids is None:
+                ids = list(range(len(values)))
+            storage = {k: int(v) for (k, v) in zip(ids, values)}
+            length = np.max(values) + 1
+
+        elif values is None:
+            if ids is None:
+                raise ValueError("ids or values must be given, both are None")
+            value = 0
+            storage = {}
+            for id in ids:
+                storage[id] = value
+                value += 1
+            length = value
+        else:
+            raise ValueError("values must be a dict, a list or a numpy array")
+
+        self.storage = storage
+        self.name = name
+
+        self.shape = (len(self), length)
+        self.dtype = dtype
+        self.indexer = indexer(self)
+
+    def __len__(self):
+        """Returns the length of the sequence of apparition of the features."""
+        return len(self.storage)
+
+    def __getitem__(self, id_keys):
+        """Subset OneHotStorage, keeping only features whose id is in id_keys.
+
+        Parameters
+        ----------
+        id_keys : Iterable
+            List of ids to keep.
+
+        Returns:
+        --------
+        OneHotStorage
+            Subset of the OneHotStorage, with only the features whose id is in id_keys
+        """
+        if isinstance(id_keys, int):
+            id_keys = [id_keys]
+        sub_storage = {k: self.storage[k] for k in id_keys}
+
+        return OneHotStorage(values=sub_storage, name=self.name, dtype=self.dtype)
+
+    def astype(self, dtype):
+        """Method to change (mainly int or float) type of returned OneHot features vectors.
+
+        Parameters
+        ----------
+        dtype : type
+            Type to set the features as
+        """
+        self.dtype = dtype
+
+    def get_element_from_index(self, index):
+        """Getter method over self.sequence.
+
+        Returns the features stored at index index. Compared to __getitem__, it does not take
+        the index-th element of sequence but the index-th element of the store.
+
+        Parameters
+        ----------
+        index : (int, list, slice)
+            index argument of the feature
+
+        Returns:
+        --------
+        array_like
+            features corresponding to the index index in self.store
+        """
+        keys = list(self.storage.keys())[index]
+        return self.storage[keys]
+
+    def get_storage_type(self):
+        """Function to access the dtype of the stored elements.
+
+        Returns:
+        --------
+        type
+            dtype of the returned OneHot vectors, as set at instantiation
+        """
+        return self.dtype
+
+    @property
+    def batch(self):
+        """Indexing attribute."""
+        return self.indexer

From dc6b719874a54282b79f3067691d3d751240fc5e Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Mon, 15 Jan 2024 10:01:35 +0100
Subject: [PATCH 15/22] ENH: ConditionalMNL creates ModelSpecification on its
 own

---
 choice_learn/models/base_model.py      |  10 +-
 choice_learn/models/conditional_mnl.py | 270 ++++++++++++++++---------
 2 files changed, 176 insertions(+), 104 deletions(-)

diff --git a/choice_learn/models/base_model.py b/choice_learn/models/base_model.py
index 38d4b220..a5560f93 100644
--- a/choice_learn/models/base_model.py
+++ b/choice_learn/models/base_model.py
@@ -472,13 +472,15 @@ def load_model(cls, path):
         # Load optimizer step
         return cls

-    def predict_probas(self, choice_dataset):
+    def predict_probas(self, choice_dataset, batch_size=-1):
         """Predicts the choice probabilities for each session and each product of a ChoiceDataset.

         Parameters
         ----------
         choice_dataset : ChoiceDataset
             Dataset on which to apply the prediction
+        batch_size : int, optional
+            Batch size to use for the prediction, by default -1

         Returns:
         --------
@@ -492,7 +494,7 @@ def predict_probas(self, choice_dataset):
             sessions_items_batch,
             availabilities_batch,
             choices_batch,
-        ) in choice_dataset.iter_batch():
+        ) in choice_dataset.iter_batch(batch_size=batch_size):
             _, probabilities = self.batch_predict(
                 items_batch,
                 sessions_batch,
@@ -504,7 +506,7 @@ def predict_probas(self, choice_dataset):

         return tf.concat(stacked_probabilities, axis=0)

-    def evaluate(self, choice_dataset, batch_size=None):
+    def evaluate(self, choice_dataset, batch_size=-1):
         """Evaluates the model for each session and each product of a ChoiceDataset.

         Predicts the probabilities according to the model and computes the Negative-Log-Likelihood
@@ -520,8 +522,8 @@ def evaluate(self, choice_dataset, batch_size=None):
         np.ndarray (n_sessions, n_items)
             Choice probabilities for each session and each product
         """
-        if batch_size is None:
-            batch_size = choice_dataset.batch_size
         batch_losses = []
         for (
             items_batch,
diff --git a/choice_learn/models/conditional_mnl.py b/choice_learn/models/conditional_mnl.py
index 9284797e..2e47451a 100644
--- a/choice_learn/models/conditional_mnl.py
+++ b/choice_learn/models/conditional_mnl.py
@@ -183,7 +183,7 @@ class ConditionalMNL(ChoiceModel):

     def __init__(
         self,
-        parameters,
+        parameters=None,
         add_exit_choice=False,
         optimizer="Adam",
         lr=0.001,
@@ -207,6 +207,78 @@ def __init__(
         self.params = parameters
         self.instantiated = False

+    def add_coefficients(
+        self, coefficient_name, feature_name, items_indexes=None, items_names=None
+    ):
+        """Adds a coefficient to the model through the specification of the utility.
+
+        Parameters
+        ----------
+        coefficient_name : str
+            Name given to the coefficient.
+        feature_name : str
+            features name to which the coefficient is associated. It should work with
+            the names given in the ChoiceDataset that will be used for
+            parameters estimation.
+        items_indexes : list of int, optional
+            list of items indexes (in the ChoiceDataset) for which we need to add a coefficient,
+            by default None
+        items_names : list of str, optional
+            list of items names (in the ChoiceDataset) for which we need to add a coefficient,
+            by default None
+
+        Raises:
+        -------
+        ValueError
+            When neither items_names nor items_indexes is specified.
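+
+        Example:
+        --------
+        A minimal usage sketch; the coefficient name, feature name and item
+        indexes below are illustrative and must exist in the ChoiceDataset
+        used for estimation:
+
+        >>> model = ConditionalMNL()
+        >>> model.add_coefficients(
+        ...     coefficient_name="beta_price",
+        ...     feature_name="price",
+        ...     items_indexes=[0, 1, 2],
+        ... )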
+        """
+        if self.params is None:
+            self.params = ModelSpecification()
+        elif not isinstance(self.params, ModelSpecification):
+            raise ValueError("Cannot add coefficient on top of a dict instantiation.")
+        self.params.add_coefficients(
+            coefficient_name=coefficient_name,
+            feature_name=feature_name,
+            items_indexes=items_indexes,
+            items_names=items_names,
+        )
+
+    def add_shared_coefficient(
+        self, coefficient_name, feature_name, items_indexes=None, items_names=None
+    ):
+        """Adds a single, shared coefficient to the model through the specification of the utility.
+
+        Parameters
+        ----------
+        coefficient_name : str
+            Name given to the coefficient.
+        feature_name : str
+            features name to which the coefficient is associated. It should work with
+            the names given in the ChoiceDataset that will be used for
+            parameters estimation.
+        items_indexes : list of int, optional
+            list of items indexes (in the ChoiceDataset) for which the coefficient will be used,
+            by default None
+        items_names : list of str, optional
+            list of items names (in the ChoiceDataset) for which the coefficient will be used,
+            by default None
+
+        Raises:
+        -------
+        ValueError
+            When neither items_names nor items_indexes is specified.
+        """
+        if self.params is None:
+            self.params = ModelSpecification()
+        elif not isinstance(self.params, ModelSpecification):
+            raise ValueError("Cannot add shared coefficient on top of a dict instantiation.")
+        self.params.add_shared_coefficient(
+            coefficient_name=coefficient_name,
+            feature_name=feature_name,
+            items_indexes=items_indexes,
+            items_names=items_names,
+        )
+
     def instantiate_from_specifications(self):
         """Instantiate the model from ModelSpecification object.

@@ -247,15 +319,15 @@ def _store_dataset_features_names(self, dataset):
         dataset : ChoiceDataset
             ChoiceDataset used to fit the model.
         """
-        self._items_features_names = dataset.items_features_names
-        self._sessions_features_names = dataset.sessions_features_names
-        self._sessions_items_features_names = dataset.sessions_items_features_names
+        self._items_features_names = dataset.fixed_items_features_names
+        self._contexts_features_names = dataset.contexts_features_names
+        self._contexts_items_features_names = dataset.contexts_items_features_names

     def compute_utility_from_specification(
         self,
         items_batch,
-        sessions_batch,
-        sessions_items_batch,
+        contexts_batch,
+        contexts_items_batch,
         availabilities_batch,
         choices_batch,
         verbose=0,
@@ -268,31 +340,31 @@ def compute_utility_from_specification(
             Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the
             products constant/fixed features.
Shape must be (n_items, n_items_features) - sessions_batch : tuple of np.ndarray (sessions_features) + contexts_batch : tuple of np.ndarray (contexts_features) Time-Features - Shape must be (n_sessions, n_sessions_features) - sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Shape must be (n_choices, n_contexts_features) + contexts_items_batch : tuple of np.ndarray (contexts_items_features) Time-Item-Features - Shape must be (n_sessions, n_sessions_items_features) + Shape must be (n_choices, n_contexts_items_features) availabilities_batch : np.ndarray - Availabilities (sessions_items_availabilities) - Shape must be (n_sessions, n_items) + Availabilities (contexts_items_availabilities) + Shape must be (n_choices, n_items) choices_batch : np.ndarray Choices - Shape must be (n_sessions, ) + Shape must be (n_choices, ) verbose : int, optional Parametrization of the logging outputs, by default 0 Returns: -------- tf.Tensor - Utilities corresponding of shape (n_sessions, n_items) + Utilities corresponding of shape (n_choices, n_items) """ del choices_batch, verbose num_items = availabilities_batch.shape[1] - num_sessions = availabilities_batch.shape[0] - sessions_items_utilities = [] + num_choices = availabilities_batch.shape[0] + contexts_items_utilities = [] # Items features for i, feat_tuple in enumerate(self._items_features_names): for j, feat in enumerate(feat_tuple): @@ -324,23 +396,23 @@ def compute_utility_from_specification( ], axis=0, ) - s_i_u = tf.stack([s_i_u] * num_sessions, axis=0) + s_i_u = tf.stack([s_i_u] * num_choices, axis=0) ### Need reshaping here - sessions_items_utilities.append(s_i_u) + contexts_items_utilities.append(s_i_u) else: print( f"Feature {feat} is in dataset but has no weight assigned in utility\ computations" ) - # Session features - for i, feat_tuple in enumerate(self._sessions_features_names): + # Context features + for i, feat_tuple in enumerate(self._contexts_features_names): for j, feat in enumerate(feat_tuple): if feat in self.params.list_features_with_weights(): item_index, weight_index = self.params.get_weight_item_indexes(feat) - s_i_u = tf.zeros((num_sessions, num_items)) + s_i_u = tf.zeros((num_choices, num_items)) for q, idx in enumerate(item_index): if isinstance(idx, list): @@ -350,7 +422,7 @@ def compute_utility_from_specification( s_i_u[:, :k], tf.expand_dims( tf.multiply( - sessions_batch[i][:, j], + contexts_batch[i][:, j], self.weights[weight_index][:, q], ), axis=-1, @@ -365,7 +437,7 @@ def compute_utility_from_specification( s_i_u[:, :idx], tf.expand_dims( tf.multiply( - sessions_batch[i][:, j], + contexts_batch[i][:, j], self.weights[weight_index][:, q], ), axis=-1, @@ -375,19 +447,19 @@ def compute_utility_from_specification( axis=1, ) - sessions_items_utilities.append(s_i_u) + contexts_items_utilities.append(s_i_u) else: print( f"Feature {feat} is in dataset but has no weight assigned in utility\ computations" ) - # Session Items features - for i, feat_tuple in enumerate(self._sessions_items_features_names): + # context Items features + for i, feat_tuple in enumerate(self._contexts_items_features_names): for j, feat in enumerate(feat_tuple): if feat in self.params.list_features_with_weights(): item_index, weight_index = self.params.get_weight_item_indexes(feat) - s_i_u = tf.zeros((num_sessions, num_items)) + s_i_u = tf.zeros((num_choices, num_items)) for q, idx in enumerate(item_index): if isinstance(idx, list): @@ -397,7 +469,7 @@ def compute_utility_from_specification( s_i_u[:, :k], tf.expand_dims( tf.multiply( - 
sessions_items_batch[i][:, k, j], + contexts_items_batch[i][:, k, j], self.weights[weight_index][:, q], ), axis=-1, @@ -412,7 +484,7 @@ def compute_utility_from_specification( s_i_u[:, :idx], tf.expand_dims( tf.multiply( - sessions_items_batch[i][:, idx, j], + contexts_items_batch[i][:, idx, j], self.weights[weight_index][:, q], ), axis=-1, @@ -422,7 +494,7 @@ def compute_utility_from_specification( axis=1, ) - sessions_items_utilities.append(s_i_u) + contexts_items_utilities.append(s_i_u) else: print( f"Feature {feat} is in dataset but has no weight assigned in utility\ @@ -443,13 +515,13 @@ def compute_utility_from_specification( axis=0, ) - s_i_u = tf.stack([s_i_u] * num_sessions, axis=0) + s_i_u = tf.stack([s_i_u] * num_choices, axis=0) ### Need reshaping here - sessions_items_utilities.append(s_i_u) + contexts_items_utilities.append(s_i_u) - return tf.reduce_sum(sessions_items_utilities, axis=0) + return tf.reduce_sum(contexts_items_utilities, axis=0) def instantiate_from_dict(self, num_items): """Instantiation of the model from a dictionnary specification. @@ -483,8 +555,8 @@ def instantiate( self, num_items, items_features_names, - sessions_features_names, - sessions_items_features_names, + contexts_features_names, + contexts_items_features_names, ): """Instantiate the model from self.params and a dataset. @@ -496,10 +568,10 @@ def instantiate( Number of different items in the assortment. Used to create the right number of weights. items_features_names : list of str Names of the items features in the dataset. - sessions_features_names : list of str - Names of the sessions features in the dataset. - sessions_items_features_names : list of str - Names of the sessions items features in the dataset. + contexts_features_names : list of str + Names of the contexts features in the dataset. + contexts_items_features_names : list of str + Names of the contexts items features in the dataset. 
Raises: ------- @@ -542,8 +614,8 @@ def instantiate( if len(tuple_names) > 0: self._items_features_names.append(tuple_names) - self._sessions_features_names = [] - for feat_tuple in sessions_features_names: + self._contexts_features_names = [] + for feat_tuple in contexts_features_names: tuple_names = [] for feat in feat_tuple: if feat in self.params.keys(): @@ -575,10 +647,10 @@ def instantiate( computations" ) if len(tuple_names) > 0: - self._sessions_features_names.append(tuple_names) + self._contexts_features_names.append(tuple_names) - self._sessions_items_features_names = [] - for feat_tuple in sessions_items_features_names: + self._contexts_items_features_names = [] + for feat_tuple in contexts_items_features_names: tuple_names = [] for feat in feat_tuple: if feat in self.params.keys(): @@ -600,7 +672,7 @@ def instantiate( name=feat, ) else: - for i, s_tuple in enumerate(sessions_features_names): + for i, s_tuple in enumerate(contexts_features_names): for j, s_feat in enumerate(s_tuple): if s_feat == self.params[feat]: # Get num weights with unique values of this feature @@ -628,7 +700,7 @@ def instantiate( ) if len(tuple_names) > 0: - self._sessions_items_features_names.append(tuple_names) + self._contexts_items_features_names.append(tuple_names) if "intercept" in self.params.keys(): if self.params["intercept"] == "constant": @@ -646,7 +718,7 @@ def instantiate( name="intercept", ) else: - # Is supposed to be in sessions_features_names + # Is supposed to be in contexts_features_names raise NotImplementedError(f"Param {self.params['intercept']} not implemented") weights.append(weight) else: @@ -659,7 +731,7 @@ def instantiate( return weights def compute_utility( - self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + self, items_batch, contexts_batch, contexts_items_batch, availabilities_batch, choices_batch ): """Main method to compute the utility of the model. Selects the right method to compute. @@ -669,75 +741,75 @@ def compute_utility( Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products constant/fixed features. Shape must be (n_items, n_items_features) - sessions_batch : tuple of np.ndarray (sessions_features) + contexts_batch : tuple of np.ndarray (contexts_features) Time-Features - Shape must be (n_sessions, n_sessions_features) - sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Shape must be (n_choices, n_contexts_features) + contexts_items_batch : tuple of np.ndarray (contexts_items_features) Time-Item-Features - Shape must be (n_sessions, n_sessions_items_features) + Shape must be (n_choices, n_contexts_items_features) availabilities_batch : np.ndarray - Availabilities (sessions_items_availabilities) - Shape must be (n_sessions, n_items) + Availabilities (contexts_items_availabilities) + Shape must be (n_choices, n_items) choices_batch : np.ndarray - Choices Shape must be (n_sessions, ) + Choices Shape must be (n_choices, ) Returns: -------- tf.Tensor - Computed utilities of shape (n_sessions, n_items). + Computed utilities of shape (n_choices, n_items). 
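+
+        Example:
+        --------
+        A shape-only sketch; the *_batch arrays stand for the ones yielded by
+        ChoiceDataset.iter_batch, and only the output shape is asserted:
+
+        >>> utilities = model.compute_utility(
+        ...     items_batch, contexts_batch, contexts_items_batch,
+        ...     availabilities_batch, choices_batch,
+        ... )
+        >>> utilities.shape  # (n_choices, n_items)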
""" if isinstance(self.params, ModelSpecification): return self.compute_utility_from_specification( items_batch, - sessions_batch, - sessions_items_batch, + contexts_batch, + contexts_items_batch, availabilities_batch, choices_batch, ) return self.compute_utility_from_dict( items_batch, - sessions_batch, - sessions_items_batch, + contexts_batch, + contexts_items_batch, availabilities_batch, choices_batch, ) def compute_utility_from_dict( - self, items_batch, sessions_batch, sessions_items_batch, availabilities_batch, choices_batch + self, items_batch, contexts_batch, contexts_items_batch, availabilities_batch, choices_batch ): """Computes the utility when the model is constructed from a dictionnary object. Parameters ---------- - tems_batch : tuple of np.ndarray (items_features) + items_batch : tuple of np.ndarray (items_features) Fixed-Item-Features: formatting from ChoiceDataset: a matrix representing the products constant/fixed features. Shape must be (n_items, n_items_features) - sessions_batch : tuple of np.ndarray (sessions_features) + contexts_batch : tuple of np.ndarray (contexts_features) Time-Features - Shape must be (n_sessions, n_sessions_features) - sessions_items_batch : tuple of np.ndarray (sessions_items_features) + Shape must be (n_choices, n_contexts_features) + contexts_items_batch : tuple of np.ndarray (contexts_items_features) Time-Item-Features - Shape must be (n_sessions, n_sessions_items_features) + Shape must be (n_choices, n_contexts_items_features) availabilities_batch : np.ndarray - Availabilities (sessions_items_availabilities) - Shape must be (n_sessions, n_items) + Availabilities (contexts_items_availabilities) + Shape must be (n_choices, n_items) choices_batch : np.ndarray Choices - Shape must be (n_sessions, ) + Shape must be (n_choices, ) verbose : int, optional Parametrization of the logging outputs, by default 0 Returns: -------- tf.Tensor - Utilities corresponding of shape (n_sessions, n_items) + Utilities corresponding of shape (n_choices, n_items) """ del availabilities_batch, choices_batch - sessions_items_utilities = [] + contexts_items_utilities = [] num_items = items_batch[0].shape[0] - num_sessions = sessions_batch[0].shape[0] + num_choices = contexts_batch[0].shape[0] # Items features for i, feat_tuple in enumerate(self._items_features_names): @@ -746,64 +818,64 @@ def compute_utility_from_dict( weight = self.weights[k] if self.params[feat] == "constant": s_i_u = tf.concat( - [tf.multiply(items_batch[i][:, j], weight)] * num_sessions, axis=0 + [tf.multiply(items_batch[i][:, j], weight)] * num_choices, axis=0 ) elif self.params[feat] == "item": weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1) s_i_u = tf.concat( - [tf.multiply(items_batch[i][:, j], weight)] * num_sessions, axis=0 + [tf.multiply(items_batch[i][:, j], weight)] * num_choices, axis=0 ) elif self.params[feat] == "item-full": s_i_u = tf.concat( - [tf.multiply(items_batch[i][:, j], weight)] * num_sessions, axis=0 + [tf.multiply(items_batch[i][:, j], weight)] * num_choices, axis=0 ) else: raise NotImplementedError(f"Param {self.params[feat]} not implemented") - sessions_items_utilities.append(s_i_u) + contexts_items_utilities.append(s_i_u) else: print( f"Feature {feat} is in dataset but has no weight assigned in utility \ computations" ) - # Session features - for i, feat_tuple in enumerate(self._sessions_features_names): + # context features + for i, feat_tuple in enumerate(self._contexts_features_names): for j, (feat, k) in enumerate(feat_tuple): if feat in 
self.params.keys():
                 weight = self.weights[k]
                 if self.params[feat] == "constant":
                     s_i_u = tf.concat(
-                        [tf.multiply(sessions_batch[i][j], weight)] * num_items, axis=-1
+                        [tf.multiply(contexts_batch[i][j], weight)] * num_items, axis=-1
                     )
                 elif self.params[feat] == "item":
                     weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1)
-                    s_i_u = tf.tensordot(sessions_batch[i][:, j : j + 1], weight, axes=1)
+                    s_i_u = tf.tensordot(contexts_batch[i][:, j : j + 1], weight, axes=1)
                 elif self.params[feat] == "item-full":
-                    s_i_u = tf.tensordot(sessions_batch[i][:, j : j + 1], weight, axes=1)
+                    s_i_u = tf.tensordot(contexts_batch[i][:, j : j + 1], weight, axes=1)
                 else:
                     raise NotImplementedError(f"Param {self.params[feat]} not implemented")

-                sessions_items_utilities.append(s_i_u)
+                contexts_items_utilities.append(s_i_u)
             else:
                 print(
                     f"Feature {feat} is in dataset but has no weight assigned in utility \
                         computations"
                 )

-        # Session Items features
-        for i, feat_tuple in enumerate(self._sessions_items_features_names):
+        # context Items features
+        for i, feat_tuple in enumerate(self._contexts_items_features_names):
             for j, (feat, k) in enumerate(feat_tuple):
                 if feat in self.params.keys():
                     weight = self.weights[k]
                     if self.params[feat] == "constant":
-                        s_i_u = tf.multiply(sessions_items_batch[i][:, :, j], weight)
+                        s_i_u = tf.multiply(contexts_items_batch[i][:, :, j], weight)
                     elif self.params[feat] == "item":
                         weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1)
-                        s_i_u = tf.multiply(sessions_items_batch[i][:, :, j], weight)
+                        s_i_u = tf.multiply(contexts_items_batch[i][:, :, j], weight)
                     elif self.params[feat] == "item-full":
-                        s_i_u = tf.multiply(sessions_items_batch[i][:, :, j], weight)
+                        s_i_u = tf.multiply(contexts_items_batch[i][:, :, j], weight)
                     else:
                         raise NotImplementedError(f"Param {self.params[feat]} not implemented")

-                    sessions_items_utilities.append(s_i_u)
+                    contexts_items_utilities.append(s_i_u)
                 else:
                     print(
                         f"Feature {feat} is in dataset but has no weight assigned in utility \
@@ -813,17 +885,17 @@
         if "intercept" in self.params.keys():
             weight = self.weights[-1]
             if self.params["intercept"] == "constant":
-                s_i_u = tf.concat([tf.concat([weight] * num_items, axis=0)] * num_sessions, axis=0)
+                s_i_u = tf.concat([tf.concat([weight] * num_items, axis=0)] * num_choices, axis=0)
             elif self.params["intercept"] == "item":
                 weight = tf.concat([tf.constant([[0.0]]), weight], axis=-1)
-                s_i_u = tf.concat([weight] * num_sessions, axis=0)
+                s_i_u = tf.concat([weight] * num_choices, axis=0)
             elif self.params["intercept"] == "item-full":
-                s_i_u = tf.concat([weight] * num_sessions, axis=0)
+                s_i_u = tf.concat([weight] * num_choices, axis=0)
             else:
-                raise NotImplementedError(f"Param {self.params[feat]} not implemented")
-            sessions_items_utilities.append(s_i_u)
+                raise NotImplementedError(f"Param {self.params['intercept']} not implemented")
+            contexts_items_utilities.append(s_i_u)

-        return tf.reduce_sum(sessions_items_utilities, axis=0)
+        return tf.reduce_sum(contexts_items_utilities, axis=0)

     def fit(self, choice_dataset, **kwargs):
         """Main fit function to estimate the parameters. 
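For reference, the three parametrization modes handled in compute_utility_from_dict above ("constant", "item", "item-full") boil down to the following self-contained sketch. It is illustrative NumPy with made-up numbers, not code from the patch; only the zero-prepending trick mirrors the tf.concat([tf.constant([[0.0]]), weight], axis=-1) lines above.

    import numpy as np

    # One feature value per item, for a single context (toy numbers).
    feature = np.array([10.0, 12.0, 8.0, 9.0])

    beta_constant = np.array([0.5])                  # "constant": one shared coefficient
    beta_item = np.array([1.0, 2.0, 3.0])            # "item": item 0 is the reference
    beta_item_full = np.array([0.1, 0.2, 0.3, 0.4])  # "item-full": one coefficient per item

    # "item" mode prepends a zero so that the reference item gets a null coefficient.
    beta_item = np.concatenate([[0.0], beta_item])

    print(feature * beta_constant)   # same slope for every item
    print(feature * beta_item)       # contribution of item 0 is zeroed out
    print(feature * beta_item_full)  # fully item-specific contributions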
@@ -845,9 +917,9 @@ def fit(self, choice_dataset, **kwargs): else: self.weights = self.instantiate( num_items=choice_dataset.get_num_items(), - items_features_names=choice_dataset.items_features_names, - sessions_features_names=choice_dataset.sessions_features_names, - sessions_items_features_names=choice_dataset.sessions_items_features_names, + items_features_names=choice_dataset.fixed_items_features_names, + contexts_features_names=choice_dataset.contexts_features_names, + contexts_items_features_names=choice_dataset.contexts_items_features_names, ) self.instantiated = True return super().fit(choice_dataset=choice_dataset, **kwargs) @@ -876,9 +948,9 @@ def _fit_with_lbfgs(self, choice_dataset, n_epochs, tolerance=1e-8): else: self.weights = self.instantiate( num_items=choice_dataset.get_num_items(), - items_features_names=choice_dataset.items_features_names, - sessions_features_names=choice_dataset.sessions_features_names, - sessions_items_features_names=choice_dataset.sessions_items_features_names, + items_features_names=choice_dataset.fixed_items_features_names, + contexts_features_names=choice_dataset.contexts_features_names, + contexts_items_features_names=choice_dataset.contexts_items_features_names, ) self.instantiated = True return super()._fit_with_lbfgs(choice_dataset, n_epochs, tolerance) From 0a9a7679088e50d1d9520ad0771a58e0dc0c8567 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Mon, 15 Jan 2024 14:15:39 +0100 Subject: [PATCH 16/22] FIX: from_df --- choice_learn/data/choice_dataset.py | 54 +++++++++++++++-------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index a261103a..2a341de4 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -90,7 +90,7 @@ def __init__( # items_features is already a tuple, names are given, checking consistency if fixed_items_features_names is not None: for f, name in zip(fixed_items_features, fixed_items_features_names): - if len(f) != len(name): + if len(f[0]) != len(name): raise ValueError( "items_features shape and items_features_names shape do not match" ) @@ -106,10 +106,12 @@ def __init__( if not isinstance(contexts_features, tuple): self._return_contexts_features_tuple = False if contexts_items_features_names is not None: - if len(contexts_features[0]) == len(contexts_features_names): + if len(contexts_features[0]) != len(contexts_features_names): raise ValueError( - """Number of features given does not match - number of features names given.""" + f"""Number of features given does not match + number of features names given: + {len(contexts_features[0])} and + {len(contexts_features_names)}""" ) contexts_features_names = (contexts_features_names,) @@ -137,8 +139,10 @@ def __init__( if contexts_items_features_names is not None: if len(contexts_items_features[0][0]) != len(contexts_items_features_names): raise ValueError( - """Number of features given does not match - number of features names given for contexts_items.""" + f"""Number of features given does not match + number of features names given for contexts_items: + {len(contexts_items_features[0][0])} and + {len(contexts_items_features_names)}""" ) contexts_items_features = (contexts_items_features,) contexts_items_features_names = (contexts_items_features_names,) @@ -195,8 +199,8 @@ def __init__( for i, feature in enumerate(contexts_features): if isinstance(feature, pd.DataFrame): # Ordering choices by id ? 
-            if "session_id" in feature.columns:
-                feature = feature.set_index("session_id")
+            if "context_id" in feature.columns:
+                feature = feature.set_index("context_id")
                 contexts_features = (
                     contexts_features[:i]
                     + (feature.loc[np.sort(feature.index)].to_numpy(),)
@@ -216,13 +220,13 @@ def __init__(
         for i, feature in enumerate(contexts_items_features):
             if isinstance(feature, pd.DataFrame):
                 # Ordering choices by id ?
-                if "session_id" in feature.columns:
+                if "context_id" in feature.columns:
                     if "item_id" in feature.columns:
                         feature_array = []
-                        for sess in np.sort(feature.session_id.unique()):
-                            sess_df = feature.loc[feature.session_id == sess]
+                        for sess in np.sort(feature.context_id.unique()):
+                            sess_df = feature.loc[feature.context_id == sess]
                             sess_df = sess_df[
-                                sess_df.columns.difference(["session_id"])
+                                sess_df.columns.difference(["context_id"])
                             ].set_index("item_id")
                             feature_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy())
                         contexts_items_features = (
@@ -236,7 +240,7 @@ def __init__(
                             + contexts_items_features_names[i + 1 :]
                         )
                     else:
-                        feature = feature.set_index("session_id")
+                        feature = feature.set_index("context_id")
                         contexts_items_features = (
                             contexts_items_features[:i]
                             + (feature.loc[np.sort(feature.index)].to_numpy(),)
@@ -248,7 +252,7 @@ def __init__(
                             + contexts_items_features_names[i + 1 :]
                         )
                 else:
-                    raise ValueError("session_id column not found in contexts_items_features")
+                    raise ValueError("context_id column not found in contexts_items_features")
             elif isinstance(feature, list):
                 contexts_items_features = (
                     contexts_items_features[:i]
@@ -261,18 +265,18 @@ def __init__(
                 contexts_items_availabilities, dtype=object
             )
         elif isinstance(contexts_items_availabilities, pd.DataFrame):
-            if "session_id" in contexts_items_availabilities.columns:
+            if "context_id" in contexts_items_availabilities.columns:
                 if "item_id" in contexts_items_availabilities.columns:
                     av_array = []
-                    for sess in np.sort(contexts_items_availabilities.session_id):
+                    for sess in np.sort(contexts_items_availabilities.context_id):
                         sess_df = contexts_items_availabilities.loc[
-                            contexts_items_availabilities.session_id == sess
+                            contexts_items_availabilities.context_id == sess
                         ]
                         sess_df = sess_df.set_index("item_id")
                         av_array.append(sess_df.loc[np.sort(sess_df.index)].to_numpy())
                     contexts_items_availabilities = np.array(av_array)
                 else:
-                    feature = feature.set_index("session_id")
+                    feature = feature.set_index("context_id")
                     contexts_items_availabilities = contexts_items_availabilities.loc[
                         np.sort(feature.index)
                     ].to_numpy()
@@ -687,7 +691,7 @@ def from_single_df(
         items_id_column="item_id",
         contexts_id_column="context_id",
         choices_column="choice",
-        choice_mode="items_name",
+        choice_mode="items_id",
     ):
         """Builds numpy arrays for ChoiceDataset from a single dataframe. 
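For reference, the long format this classmethod consumes, together with the renamed "items_id" / "one_zero" choice modes, can be sketched as follows. The dataframe is hypothetical, and the *_features_columns arguments that are not needed are simply omitted (assuming they default to None):

    import pandas as pd

    from choice_learn.data import ChoiceDataset

    # Toy long-format dataframe: one row per (context, item) pair.
    df = pd.DataFrame(
        {
            "item_id": ["car", "bus", "car", "bus"],
            "context_id": [0, 0, 1, 1],
            "cost": [10.0, 4.0, 12.0, 4.5],
            "choice": [1, 0, 0, 1],  # "one_zero" mode: 1 flags the chosen item
        }
    )

    dataset = ChoiceDataset.from_single_df(
        df=df,
        contexts_items_features_columns=["cost"],
        items_id_column="item_id",
        contexts_id_column="context_id",
        choices_column="choice",
        choice_mode="one_zero",
    )

With choice_mode="items_id" (the new default), the choices column would instead hold the id of the chosen item, e.g. "car" on both rows of context 0.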
@@ -709,7 +713,7 @@ def from_single_df(
             Name of the column containing the choices, default is "choice"
         choice_mode: str, optional
-            How choice is indicated in df, either "items_name" or "one_zero",
-            default is "items_name"
+            How choice is indicated in df, either "items_id" or "one_zero",
+            default is "items_id"

     Returns:
     -------
@@ -757,7 +761,7 @@ def from_single_df(
             else None
         )

-        if choice_mode == "item_id":
+        if choice_mode == "items_id":
             choices = df[[choices_column, contexts_id_column]].drop_duplicates(contexts_id_column)
             choices = choices.set_index(contexts_id_column)
             choices = choices.loc[sessions].to_numpy()
@@ -766,7 +770,7 @@ def from_single_df(
         elif choice_mode == "one_zero":
             choices = df[[items_id_column, choices_column, contexts_id_column]]
             choices = choices.loc[choices[choices_column] == 1]
-            choices = choices = choices.set_index(contexts_id_column)
+            choices = choices.set_index(contexts_id_column)
             choices = (
                 choices.loc[sessions][items_id_column]
                 .map({k: v for v, k in enumerate(items)})
@@ -774,7 +778,7 @@ def from_single_df(
         else:
             raise ValueError(
-                f"choice_mode {choice_mode} not recognized. Must be in ['item_id', 'one_zero']"
+                f"choice_mode {choice_mode} not recognized. Must be in ['items_id', 'one_zero']"
             )
         return ChoiceDataset(
             fixed_items_features=items_features,
@@ -1067,7 +1071,7 @@ def batch(self):
         """Indexer. Corresponds to get_choice_batch, but with [] logic."""
         return self.indexer

-    def iter_batch(self, batch_size, shuffle=None, sample_weight=None):
+    def iter_batch(self, batch_size, shuffle=False, sample_weight=None):
         """Iterates over dataset return batches of length batch_size.

         Newer version.
@@ -1081,8 +1085,6 @@ def iter_batch(self, batch_size, shuffle=None, sample_weight=None):
         sample_weight : Iterable
             list of weights to be returned with the right indexing during the shuffling
         """
-        if shuffle is None:
-            shuffle = self.shuffle
         if batch_size == -1:
             batch_size = len(self)
         # Get indexes for each choice

From dd4697c0e1b756654a3b0488e179f41cea5329b8 Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Mon, 15 Jan 2024 14:16:08 +0100
Subject: [PATCH 17/22] ADD: notebook with all different instantiation possibilities

---
 notebooks/dataset_creation.ipynb | 256 +++++++++++++++++++++++++++++++
 1 file changed, 256 insertions(+)
 create mode 100644 notebooks/dataset_creation.ipynb

diff --git a/notebooks/dataset_creation.ipynb b/notebooks/dataset_creation.ipynb
new file mode 100644
index 00000000..9fb255a1
--- /dev/null
+++ b/notebooks/dataset_creation.ipynb
@@ -0,0 +1,256 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## All possible ways to create a ChoiceDataset\n",
+    "\n",
+    "Listed below!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "sys.path.append(\"../\")\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from choice_learn.data import ChoiceDataset\n",
+    "from choice_learn.data.storage import FeaturesStorage"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will use the ModeCanada dataset for this example. 
We can download it directly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from choice_learn.datasets import load_modecanada, load_swissmetro\n",
+    "\n",
+    "canada_transport_df = load_modecanada(as_frame=True)\n",
+    "print(canada_transport_df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's create a column indicating whether the considered transport alternative is an individual mode of transport or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "canada_transport_df[\"is_individual\"] = canada_transport_df.apply(lambda row: 1 if row.alt == \"car\" else 0,\n",
+    "                                                                 axis=1)\n",
+    "print(canada_transport_df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### From a single dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = ChoiceDataset.from_single_df(df=canada_transport_df,\n",
+    "                                       fixed_items_features_columns=[\"is_individual\"],\n",
+    "                                       contexts_features_columns=[\"income\"],\n",
+    "                                       contexts_items_features_columns=[\"dist\", \"cost\", \"ivt\", \"ovt\"],\n",
+    "                                       items_id_column=\"alt\",\n",
+    "                                       contexts_id_column=\"case\",\n",
+    "                                       choices_column=\"choice\",\n",
+    "                                       choice_mode=\"one_zero\", # the choice column indicates whether the item is chosen (1) or not (0)\n",
+    "                                       )\n",
+    "print(dataset.summary())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Another mode is possible, if the dataframe indicates the name of the chosen item instead of ones and zeros:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id_df = canada_transport_df.copy(deep=True)\n",
+    "one_hot_choice = [0] * len(id_df)\n",
+    "for n_row, row in id_df.iterrows():\n",
+    "    if row.choice == 0:\n",
+    "        sub_df = id_df[id_df.case == row.case]\n",
+    "        choice = sub_df.loc[sub_df.choice == 1].alt.to_numpy()[0]\n",
+    "        one_hot_choice[n_row-1] = choice\n",
+    "\n",
+    "for n_row, row in id_df.iterrows():\n",
+    "    if row.choice == 1:\n",
+    "        one_hot_choice[n_row-1] = row.alt\n",
+    "\n",
+    "id_df[\"one_hot_choice\"] = one_hot_choice\n",
+    "\n",
+    "print(id_df.head())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = ChoiceDataset.from_single_df(df=id_df,\n",
+    "                                       fixed_items_features_columns=[\"is_individual\"],\n",
+    "                                       contexts_features_columns=[\"income\"],\n",
+    "                                       contexts_items_features_columns=[\"dist\", \"cost\", \"ivt\", \"ovt\"],\n",
+    "                                       items_id_column=\"alt\",\n",
+    "                                       contexts_id_column=\"case\",\n",
+    "                                       choices_column=\"one_hot_choice\",\n",
+    "                                       choice_mode=\"items_id\", # the choice column contains the id of the chosen item\n",
+    "                                       )\n",
+    "print(dataset.summary())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's say that you have your data split into several files:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "canada_transport_df.alt.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fixed_items_features = pd.DataFrame({\"item_id\": [\"car\", \"train\", \"bus\", \"air\"],\n",
+    "                                     \"is_individual\": [1, 0, 0, 0]})\n",
+    "# The item_id column is necessary, otherwise it will keep the 
order\n", + "# however it is less safe with pd.DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_features = canada_transport_df[[\"case\", \"income\"]].drop_duplicates()\n", + "contexts_features = contexts_features.rename(columns={\"case\": \"context_id\"})\n", + "# If the context_id column does not exist, the index is used" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "contexts_items_features = canada_transport_df[[\"case\", \"alt\", \"dist\", \"cost\", \"ivt\", \"ovt\"]]\n", + "contexts_items_features = contexts_items_features.rename(columns={\"case\": \"context_id\", \"alt\": \"item_id\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "canada_transport_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "choices = canada_transport_df.loc[canada_transport_df.choice==1][[\"case\", \"alt\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = ChoiceDataset(fixed_items_features=fixed_items_features,\n", + " contexts_features=contexts_features,\n", + " contexts_items_features=contexts_items_features,\n", + " choices=choices)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tf_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 36bfd6972e3b891db18d8dbe8754986a72cda00b Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Mon, 15 Jan 2024 14:16:59 +0100 Subject: [PATCH 18/22] small changes --- .../choice_learn_introduction_clogit.ipynb | 127 ++++++++---------- notebooks/features_storage_example.ipynb | 64 ++++++++- 2 files changed, 122 insertions(+), 69 deletions(-) diff --git a/notebooks/choice_learn_introduction_clogit.ipynb b/notebooks/choice_learn_introduction_clogit.ipynb index 9aea031e..85c345f8 100644 --- a/notebooks/choice_learn_introduction_clogit.ipynb +++ b/notebooks/choice_learn_introduction_clogit.ipynb @@ -70,11 +70,11 @@ "source": [ "from choice_learn.data import ChoiceDataset\n", "dataset = ChoiceDataset.from_single_df(df=transport_df,\n", - " items_features_columns=[\"oh_air\", \"oh_bus\", \"oh_car\", \"oh_train\"],\n", - " sessions_features_columns=[\"income\"],\n", - " sessions_items_features_columns=[\"cost\", \"freq\", \"ovt\", \"ivt\"],\n", + " fixed_items_features_columns=[\"oh_air\", \"oh_bus\", \"oh_car\", \"oh_train\"],\n", + " contexts_features_columns=[\"income\"],\n", + " contexts_items_features_columns=[\"cost\", \"freq\", \"ovt\", \"ivt\"],\n", " items_id_column=\"alt\",\n", - " sessions_id_column=\"case\",\n", + " contexts_id_column=\"case\",\n", " choices_column=\"choice\",\n", " choice_mode=\"one_zero\")" ] @@ -92,7 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "from choice_learn.models import ConditionalMNL, ModelSpecification" + "from choice_learn.models import 
ConditionalMNL" ] }, { @@ -138,25 +138,25 @@ "metadata": {}, "outputs": [], "source": [ - "# Instantiation of the specification\n", - "spec = ModelSpecification()\n", + "# Initialization of the model\n", + "model = ConditionalMNL(optimizer=\"lbfgs\")\n", "\n", "# Creation of the different weights:\n", "\n", "\n", "# add_coefficients adds one coefficient for each specified item_index\n", "# intercept, and income are added for each item except the first one that needs to be zeroed\n", - "spec.add_coefficients(coefficient_name=\"beta_inter\", feature_name=\"intercept\", items_indexes=[1, 2, 3])\n", - "spec.add_coefficients(coefficient_name=\"beta_income\", feature_name=\"income\", items_indexes=[1, 2, 3])\n", + "model.add_coefficients(coefficient_name=\"beta_inter\", feature_name=\"intercept\", items_indexes=[1, 2, 3])\n", + "model.add_coefficients(coefficient_name=\"beta_income\", feature_name=\"income\", items_indexes=[1, 2, 3])\n", "\n", "# ivt is added for each item:\n", - "spec.add_coefficients(coefficient_name=\"beta_ivt\", feature_name=\"ivt\", items_indexes=[0, 1, 2, 3])\n", + "model.add_coefficients(coefficient_name=\"beta_ivt\", feature_name=\"ivt\", items_indexes=[0, 1, 2, 3])\n", "\n", "# shared_coefficient add one coefficient that is used for all items specified in the items_indexes:\n", "# Here, cost, freq and ovt coefficients are shared between all items\n", - "spec.add_shared_coefficient(coefficient_name=\"beta_cost\", feature_name=\"cost\", items_indexes=[0, 1, 2, 3])\n", - "spec.add_shared_coefficient(coefficient_name=\"beta_freq\", feature_name=\"freq\", items_indexes=[0, 1, 2, 3])\n", - "spec.add_shared_coefficient(coefficient_name=\"beta_ovt\", feature_name=\"ovt\", items_indexes=[0, 1, 2, 3])" + "model.add_shared_coefficient(coefficient_name=\"beta_cost\", feature_name=\"cost\", items_indexes=[0, 1, 2, 3])\n", + "model.add_shared_coefficient(coefficient_name=\"beta_freq\", feature_name=\"freq\", items_indexes=[0, 1, 2, 3])\n", + "model.add_shared_coefficient(coefficient_name=\"beta_ovt\", feature_name=\"ovt\", items_indexes=[0, 1, 2, 3])" ] }, { @@ -166,20 +166,27 @@ "Now, we can instantiate our ConditionalMNL from the specification. We use LBFGS as the estimation method." 
]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In order to estimate the coefficients' values, use the .fit method with the ChoiceDataset:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "cmnl = ConditionalMNL(spec, optimizer=\"lbfgs\")"
+    "history = model.fit(dataset, n_epochs=1000)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In order to estimate the the coefficients values, use the .fit method with the ChoiceDataset:"
+    "It is possible to see the estimated coefficients with the .weights argument:"
    ]
   },
@@ -188,14 +195,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "history = cmnl.fit(dataset, n_epochs=1000)"
+    "model.weights"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "It is possible to see the estimated coefficients with the .weights argument:"
+    "The negative loglikelihood can be estimated using .evaluate():"
    ]
   },
@@ -204,14 +211,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "cmnl.weights"
+    "print(\"The average neg-loglikelihood is:\", model.evaluate(dataset).numpy())\n",
+    "print(\"The total neg-loglikelihood is:\", model.evaluate(dataset).numpy()*len(dataset))"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The negative loglikelihood can be estimated using .evaluate():"
+    "A faster specification can be done using a dictionary. It follows the torch-choice \\ref{} method to create conditional logit models.\n",
+    "The parameters dict needs to be as follows:\n",
+    "- The key is the feature name\n",
+    "- The value is the mode. Currently three modes are available:\n",
+    "    - constant: the learned coefficient is shared by all items\n",
+    "    - item: one coefficient by item is estimated, the value for the item at index 0 is set to 0\n",
+    "    - item-full: one coefficient by item is estimated\n",
+    "\n",
+    "In order to create the same model for the ModeCanada dataset, it looks as follows:"
    ]
   },
@@ -220,8 +236,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(\"The average neg-loglikelihood is:\", cmnl.evaluate(dataset).numpy())\n",
-    "print(\"The total neg-loglikelihood is:\", cmnl.evaluate(dataset).numpy()*len(dataset))"
+    "# Instantiation of the parameters dictionary\n",
+    "params = {\"income\": \"item\",\n",
+    "          \"cost\": \"constant\", \n",
+    "          \"freq\": \"constant\",\n",
+    "          \"ovt\": \"constant\", \n",
+    "          \"ivt\": \"item-full\",\n",
+    "          \"intercept\": \"item\"}\n",
+    "\n",
+    "# Instantiation of the model\n",
+    "cmnl = ConditionalMNL(parameters=params, optimizer=\"lbfgs\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "history = cmnl.fit(dataset, n_epochs=1000)\n",
+    "print(cmnl.weights)"
    ]
   },
   {
@@ -241,14 +275,14 @@
    "\n",
    "# Here are the values obtained in the references:\n",
    "gt_weights = [\n",
-    "    tf.constant([[0.697311, 1.8437, 3.27381]]),\n",
     "    tf.constant([[-0.0890796, -0.0279925, -0.038146]]),\n",
-    "    tf.constant([[0.0595089, -0.00678188, -0.00645982, -0.00145029]]),\n",
     "    tf.constant([[-0.0333421]]),\n",
     "    tf.constant([[0.0925304]]),\n",
     "    tf.constant([[-0.0430032]]),\n",
+    "    tf.constant([[0.0595089, -0.00678188, -0.00645982, -0.00145029]]),\n",
+    "    tf.constant([[0.697311, 1.8437, 3.27381]]),\n",
    "]\n",
-    "gt_model = ConditionalMNL(spec, lr=0.01)\n",
+    "gt_model = ConditionalMNL(parameters=params, lr=0.01)\n",
    "gt_model.fit(dataset, n_epochs=1, batch_size=-1)\n",
    "\n",
    "# Here we estimate the negative log-likelihood with these coefficients (also, we obtain same 
value as in those papers):\n", @@ -298,7 +332,7 @@ "metadata": {}, "outputs": [], "source": [ - "cmnl = ConditionalMNL(spec, optimizer=\"Adam\")\n", + "cmnl = ConditionalMNL(parameters=params, optimizer=\"Adam\")\n", "history = cmnl.fit(dataset, n_epochs=2000, batch_size=-1)\n", "cmnl.optimizer.lr = cmnl.optimizer.lr / 5\n", "history2 = cmnl.fit(dataset, n_epochs=4000, batch_size=-1)\n", @@ -324,49 +358,6 @@ "cmnl.evaluate(dataset)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A faster specification can be done using a dictionnary. It follows torch-choice \\ref{} method to create conditional logit models.\n", - "The parameters dict needs to be as follows:\n", - "- The key is the feature name\n", - "- The value is the mode. Currently three modes are available:\n", - " - constant: the learned coefficient is shared by all items\n", - " - item: one coefficient by item is estimated, the value for the item at index 0 is set to 0\n", - " - item-full: one coefficient by item is estimated\n", - "\n", - "In order to create the same model for the ModeCanada dataset, it looks as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Instantiation of the parameters dictionnary\n", - "params = {\"income\": \"item\",\n", - " \"cost\": \"constant\", \n", - " \"freq\": \"constant\",\n", - " \"ovt\": \"constant\", \n", - " \"ivt\": \"item-full\",\n", - " \"intercept\": \"item\"}\n", - "\n", - "# Instantiation of the model\n", - "cmnl = ConditionalMNL(parameters=params, optimizer=\"lbfgs\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "history = cmnl.fit(dataset, n_epochs=1000)\n", - "print(cmnl.weights)" - ] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/features_storage_example.ipynb b/notebooks/features_storage_example.ipynb index 4ed33770..43200b9b 100644 --- a/notebooks/features_storage_example.ipynb +++ b/notebooks/features_storage_example.ipynb @@ -26,7 +26,7 @@ "metadata": {}, "outputs": [], "source": [ - "from choice_learn.data.storage import FeaturesStorage" + "from choice_learn.data.storage import FeaturesStorage, OneHotStorage" ] }, { @@ -107,6 +107,68 @@ "storage.batch[[\"customerA\", \"customerC\", \"customerA\", \"customerC\"]]" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ids = [0, 1, 2, 3, 4]\n", + "values = [4, 3, 2, 1, 0]\n", + "\n", + "oh_storage = OneHotStorage(ids=ids, values=values, name=\"OneHotTest\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "oh_storage.batch[[0, 2, 4]], oh_storage.get_element_from_index(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "oh_storage = OneHotStorage(values=values, name=\"OneHotTest\")\n", + "oh_storage.batch[[0, 2, 4]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "oh_storage = OneHotStorage(ids=ids, name=\"OneHotTest\")\n", + "oh_storage.batch[[0, 2, 4]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "values_dict = {k:v for k, v in zip(ids, values)}\n", + "oh_storage = OneHotStorage(values=values_dict, name=\"OneHotTest\")\n", + "oh_storage.batch[[0, 2, 4]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": {}, + "outputs": [], + "source": [ + "oh_storage = OneHotStorage(ids=ids, values=values_dict, name=\"OneHotTest\")\n", + "oh_storage = OneHotStorage(name=\"OneHotTest\")" + ] + }, { "cell_type": "code", "execution_count": null, From 6b23a071c458ee8d5054f1da45d67d1802696260 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Mon, 15 Jan 2024 14:24:00 +0100 Subject: [PATCH 19/22] test bandit config --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cc399e36..f4693f3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,5 +64,5 @@ quote-style = "double" [tool.ruff.isort] known-first-party = ["choice_learn", "config", "tests"] -[tool.bandit] +[tool.bandit.assert_used] exclude_dirs = ["tests/"] From 2ab676b6b005158171c5d5dc71b0bdbdf6652a1c Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Mon, 15 Jan 2024 15:52:00 +0100 Subject: [PATCH 20/22] FIX: few important fixes --- choice_learn/data/choice_dataset.py | 45 ++++++++++++++++++++++------- choice_learn/data/indexer.py | 3 +- choice_learn/data/storage.py | 8 +++-- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/choice_learn/data/choice_dataset.py b/choice_learn/data/choice_dataset.py index 2a341de4..b03b17c4 100644 --- a/choice_learn/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -67,6 +67,7 @@ def __init__( if choices is None: # Done to keep a logical order of arguments, and has logic: choices have to be specified raise ValueError("Choices must be specified, got None") + choices = np.array(choices) # --------- [ Handling features type given as tuples or not ] --------- # # If items_features is not given as tuple, transform it internally as a tuple @@ -766,7 +767,7 @@ def from_single_df( choices = choices.set_index(contexts_id_column) choices = choices.loc[sessions].to_numpy() # items is the value (str) of the item - choices = [np.where(items == c)[0] for c in choices] + choices = np.squeeze([np.where(items == c)[0] for c in choices]) elif choice_mode == "one_zero": choices = df[[items_id_column, choices_column, contexts_id_column]] choices = choices.loc[choices[choices_column] == 1] @@ -1048,21 +1049,45 @@ def __getitem__(self, choices_indexes): elif isinstance(choices_indexes, slice): return self.__getitem__(list(range(*choices_indexes.indices(len(self.choices))))) - return ChoiceDataset( - fixed_items_features=self.fixed_items_features, - contexts_features=tuple( + if self.fixed_items_features[0] is None: + fixed_items_features = None + else: + fixed_items_features = self.fixed_items_features + if self.contexts_features[0] is None: + contexts_features = None + else: + contexts_features = tuple( self.contexts_features[i][choices_indexes] for i in range(len(self.contexts_features)) - ), - contexts_items_features=tuple( + ) + if self.contexts_items_features[0] is None: + contexts_items_features = None + else: + contexts_items_features = tuple( self.contexts_items_features[i][choices_indexes] for i in range(len(self.contexts_items_features)) - ), + ) + if self.fixed_items_features_names[0] is None: + fixed_items_features_names = None + else: + fixed_items_features_names = self.fixed_items_features_names + if self.contexts_features_names[0] is None: + contexts_features_names = None + else: + contexts_features_names = self.contexts_features_names + if self.contexts_items_features_names[0] is None: + contexts_items_features_names = None + else: + contexts_items_features_names = self.contexts_items_features_names + return 
ChoiceDataset( + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, contexts_items_availabilities=self.contexts_items_availabilities[choices_indexes], choices=[self.choices[i] for i in choices_indexes], - fixed_items_features_names=self.fixed_items_features_names, - contexts_features_names=self.contexts_features_names, - contexts_items_features_names=self.contexts_items_features_names, + fixed_items_features_names=fixed_items_features_names, + contexts_features_names=contexts_features_names, + contexts_items_features_names=contexts_items_features_names, features_by_ids=self.features_by_ids, ) diff --git a/choice_learn/data/indexer.py b/choice_learn/data/indexer.py index 6c9137dc..d070ee09 100644 --- a/choice_learn/data/indexer.py +++ b/choice_learn/data/indexer.py @@ -1,6 +1,5 @@ """Indexer classes for data classes.""" from abc import abstractmethod -from collections.abc import Iterable import numpy as np @@ -92,7 +91,7 @@ def __getitem__(self, sequence_keys): array_like features corresponding to the sequence_keys """ - if isinstance(sequence_keys, Iterable): + if isinstance(sequence_keys, list) or isinstance(sequence_keys, np.ndarray): return np.array([self.storage.storage[key] for key in sequence_keys]) if isinstance(sequence_keys, slice): raise ValueError("Slicing is not supported for storage") diff --git a/choice_learn/data/storage.py b/choice_learn/data/storage.py index 365f84de..d38ef03e 100644 --- a/choice_learn/data/storage.py +++ b/choice_learn/data/storage.py @@ -115,8 +115,10 @@ def get_element_from_index(self, index): array_like features corresponding to the index index in self.store """ - keys = list(self.storage.keys())[index] - return self.storage[keys] + if isinstance(index, int): + index = [index] + keys = [list(self.storage.keys())[i] for i in index] + return self.batch[keys] def __len__(self): """Returns the length of the sequence of apparition of the features.""" @@ -135,6 +137,8 @@ def __getitem__(self, id_keys): FeaturesStorage Subset of the FeaturesStorage, with only the features whose id is in id_keys """ + if not isinstance(id_keys, list): + id_keys = [id_keys] sub_storage = {k: self.storage[k] for k in id_keys} return FeaturesStorage(values=sub_storage, values_names=self.values_names, name=self.name) From a5109917559ec2d7e6e458206490eda434b52052 Mon Sep 17 00:00:00 2001 From: VincentAuriau Date: Mon, 15 Jan 2024 15:53:54 +0100 Subject: [PATCH 21/22] ADD: tests with new signature --- notebooks/features_storage_example.ipynb | 1 - tests/unit_tests/data/test_choice_dataset.py | 226 +++++++++--------- tests/unit_tests/data/test_store.py | 237 ++++++++++++------- 3 files changed, 263 insertions(+), 201 deletions(-) diff --git a/notebooks/features_storage_example.ipynb b/notebooks/features_storage_example.ipynb index 43200b9b..5e8df68a 100644 --- a/notebooks/features_storage_example.ipynb +++ b/notebooks/features_storage_example.ipynb @@ -36,7 +36,6 @@ "outputs": [], "source": [ "features = {\"customerA\": [1, 2, 3], \"customerB\": [4, 5, 6], \"customerC\": [7, 8, 9]}\n", - "\n", "storage = FeaturesStorage(values=features, values_names=[\"age\", \"income\", \"children_nb\"], name=\"customers\")" ] }, diff --git a/tests/unit_tests/data/test_choice_dataset.py b/tests/unit_tests/data/test_choice_dataset.py index 0ef46728..b288cde3 100644 --- a/tests/unit_tests/data/test_choice_dataset.py +++ b/tests/unit_tests/data/test_choice_dataset.py @@ -4,7 +4,7 @@ from choice_learn.data.choice_dataset 
import ChoiceDataset -items_features = [ +fixed_items_features = [ [1, 2], # item 1 [size, weight] [2, 4], # item 2 [size, weight] [1.5, 1.5], # item 3 [size, weight] @@ -17,19 +17,19 @@ # Customer 2 bought item 3 at session 2 choices = [0, 2, 1] -sessions_items_availabilities = [ +contexts_items_availabilities = [ [1, 1, 1], # All items available at session 1 [1, 1, 1], # All items available at session 2 [0, 1, 1], # Item 1 not available at session 3 ] -sessions_features = [ +contexts_features = [ [100, 20], # session 1, customer 1 [budget, age] [200, 40], # session 2, customer 2 [budget, age] [80, 20], # session 3, customer 1 [budget, age] ] -sessions_items_features = [ +contexts_items_features = [ [ [100, 0], # Session 1, Item 1 [price, promotion] [140, 0], # Session 1, Item 2 [price, promotion] @@ -53,19 +53,10 @@ def test_instantiate_len(): """Test the __init__ method.""" choices = [0, 2, 1] dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, - choices=choices, - ) - assert len(dataset) == 3 - choices = [[0], [1, 2], [2, 1, 1, 1]] - dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) assert len(dataset) == 3 @@ -76,10 +67,10 @@ def test_fail_instantiate(): choices = [0, 1] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -93,10 +84,10 @@ def test_fail_instantiate_2(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -109,10 +100,10 @@ def test_fail_instantiate_3(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -125,10 +116,10 @@ def test_fail_instantiate_10(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + 
contexts_items_features=contexts_items_features, + contexts_items_availabilities=sessions_items_availabilities, choices=choices, ) @@ -142,10 +133,10 @@ def test_fail_instantiate_4(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=sessions_items_availabilities, choices=choices, ) @@ -158,10 +149,10 @@ def test_fail_instantiate_5(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=sessions_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -175,10 +166,10 @@ def test_fail_instantiate_6(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=sessions_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -199,10 +190,10 @@ def test_fail_instantiate_7(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=sessions_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -226,10 +217,10 @@ def test_fail_instantiate_8(): ] with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=sessions_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -239,10 +230,10 @@ def test_fail_instantiate_9(): choices = [0, 4, 2] # choices higher than nb of items with pytest.raises(ValueError): ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) @@ -250,15 +241,14 @@ def test_fail_instantiate_9(): def test_shape(): """Tests get shape methods.""" dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + 
fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) assert dataset.get_num_items() == 3 - assert dataset.get_num_sessions() == 3 assert dataset.get_num_choices() == 3 @@ -269,7 +259,7 @@ def test_from_df(): "item_id": [0, 1, 2, 0, 1, 2, 1, 2], "items_feat_1": [1, 2, 1.5, 1, 2, 1.5, 2, 1.5], "items_feat_2": [2, 4, 1.5, 2, 4, 1.5, 4, 1.5], - "session_id": [0, 0, 0, 1, 1, 1, 2, 2], + "context_id": [0, 0, 0, 1, 1, 1, 2, 2], "session_feat_1": [100, 100, 100, 200, 200, 200, 80, 80], "session_feat_2": [20, 20, 20, 40, 40, 40, 20, 20], "session_item_feat_1": [100, 140, 200, 100, 120, 200, 120, 180], @@ -279,27 +269,29 @@ def test_from_df(): ) cd_test = ChoiceDataset.from_single_df( features_df, - items_features_columns=["items_feat_1", "items_feat_2"], - sessions_features_columns=["session_feat_1", "session_feat_2"], - sessions_items_features_columns=["session_item_feat_1", "session_item_feat_2"], - choice_mode="item_id", + fixed_items_features_columns=["items_feat_1", "items_feat_2"], + contexts_features_columns=["session_feat_1", "session_feat_2"], + contexts_items_features_columns=["session_item_feat_1", "session_item_feat_2"], + choice_mode="items_id", ) ground_truth_cd = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) - assert (cd_test.items_features[0] == ground_truth_cd.items_features[0]).all() - assert (cd_test.sessions_features[0] == ground_truth_cd.sessions_features[0]).all() + assert (cd_test.fixed_items_features[0] == ground_truth_cd.fixed_items_features[0]).all() + assert (cd_test.contexts_features[0] == ground_truth_cd.contexts_features[0]).all() assert ( - cd_test.sessions_items_features[0].astype("float32") - == ground_truth_cd.sessions_items_features[0].astype("float32") + cd_test.contexts_items_features[0].astype("float32") + == ground_truth_cd.contexts_items_features[0].astype("float32") ).all() assert ( - cd_test.sessions_items_availabilities == ground_truth_cd.sessions_items_availabilities + cd_test.contexts_items_availabilities == ground_truth_cd.contexts_items_availabilities ).all() + print(cd_test.choices) + print(cd_test.fixed_items_features) assert (cd_test.choices == ground_truth_cd.choices).all() features_df = pd.DataFrame( @@ -307,7 +299,7 @@ def test_from_df(): "item_id": [0, 1, 2, 0, 1, 2, 1, 2], "items_feat_1": [1, 2, 1.5, 1, 2, 1.5, 2, 1.5], "items_feat_2": [2, 4, 1.5, 2, 4, 1.5, 4, 1.5], - "session_id": [0, 0, 0, 1, 1, 1, 2, 2], + "context_id": [0, 0, 0, 1, 1, 1, 2, 2], "session_feat_1": [100, 100, 100, 200, 200, 200, 80, 80], "session_feat_2": [20, 20, 20, 40, 40, 40, 20, 20], "session_item_feat_1": [100, 140, 200, 100, 120, 200, 120, 180], @@ -317,26 +309,26 @@ def test_from_df(): ) cd_test = ChoiceDataset.from_single_df( features_df, - items_features_columns=["items_feat_1", "items_feat_2"], - sessions_features_columns=["session_feat_1", "session_feat_2"], - sessions_items_features_columns=["session_item_feat_1", "session_item_feat_2"], + fixed_items_features_columns=["items_feat_1", "items_feat_2"], + 
contexts_features_columns=["session_feat_1", "session_feat_2"], + contexts_items_features_columns=["session_item_feat_1", "session_item_feat_2"], choice_mode="one_zero", ) ground_truth_cd = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) - assert (cd_test.items_features[0] == ground_truth_cd.items_features[0]).all() - assert (cd_test.sessions_features[0] == ground_truth_cd.sessions_features[0]).all() + assert (cd_test.fixed_items_features[0] == ground_truth_cd.fixed_items_features[0]).all() + assert (cd_test.contexts_features[0] == ground_truth_cd.contexts_features[0]).all() assert ( - cd_test.sessions_items_features[0].astype("float32") - == ground_truth_cd.sessions_items_features[0].astype("float32") + cd_test.contexts_items_features[0].astype("float32") + == ground_truth_cd.contexts_items_features[0].astype("float32") ).all() assert ( - cd_test.sessions_items_availabilities == ground_truth_cd.sessions_items_availabilities + cd_test.contexts_items_availabilities == ground_truth_cd.contexts_items_availabilities ).all() assert (cd_test.choices == ground_truth_cd.choices).all() @@ -344,10 +336,10 @@ def test_from_df(): def test_summary(): """Tests summary method.""" dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) dataset.summary() @@ -357,21 +349,21 @@ def test_summary(): def test_getitem(): """Tests getitem method.""" dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) sub_dataset = dataset[[0, 1]] - assert (sub_dataset.items_features[0] == dataset.items_features[0]).all() - assert (sub_dataset.sessions_features[0] == dataset.sessions_features[0][[0, 1]]).all() + assert (sub_dataset.fixed_items_features[0] == dataset.fixed_items_features[0]).all() + assert (sub_dataset.contexts_features[0] == dataset.contexts_features[0][[0, 1]]).all() assert ( - sub_dataset.sessions_items_features[0] == dataset.sessions_items_features[0][[0, 1]] + sub_dataset.contexts_items_features[0] == dataset.contexts_items_features[0][[0, 1]] ).all() assert ( - sub_dataset.sessions_items_availabilities == dataset.sessions_items_availabilities[[0, 1]] + sub_dataset.contexts_items_availabilities == dataset.contexts_items_availabilities[[0, 1]] ).all() assert (sub_dataset.choices == dataset.choices[[0, 1]]).all() assert (sub_dataset.choices == [0, 2]).all() @@ -380,17 +372,17 @@ def test_getitem(): def test_batch(): """Tests the batch method.""" dataset = ChoiceDataset( - items_features=items_features, - 
sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) batch = dataset.batch[[0, 1]] - assert (batch[0] == items_features).all() - assert (batch[1] == sessions_features[:2]).all() - assert (batch[2] == sessions_items_features[:2]).all() - assert (batch[3] == sessions_items_availabilities[:2]).all() + assert (batch[0] == fixed_items_features).all() + assert (batch[1] == contexts_features[:2]).all() + assert (batch[2] == contexts_items_features[:2]).all() + assert (batch[3] == contexts_items_availabilities[:2]).all() assert (batch[4] == choices[:2]).all() sliced_batch = dataset.batch[:2] @@ -411,10 +403,10 @@ def test_batch(): def test_iter_batch(): """Tests the iter_batch method.""" dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) for batch_nb, batch in enumerate(dataset.iter_batch(batch_size=2)): @@ -429,10 +421,10 @@ def test_iter_batch(): def test_filter(): """Tests the filter method.""" dataset = ChoiceDataset( - items_features=items_features, - sessions_features=sessions_features, - sessions_items_features=sessions_items_features, - sessions_items_availabilities=sessions_items_availabilities, + fixed_items_features=fixed_items_features, + contexts_features=contexts_features, + contexts_items_features=contexts_items_features, + contexts_items_availabilities=contexts_items_availabilities, choices=choices, ) filtered_dataset = dataset.filter([True, False, True]) diff --git a/tests/unit_tests/data/test_store.py b/tests/unit_tests/data/test_store.py index d1fd3df8..9fdd1767 100644 --- a/tests/unit_tests/data/test_store.py +++ b/tests/unit_tests/data/test_store.py @@ -1,133 +1,204 @@ """Test the store module.""" -from choice_learn.data.store import FeaturesStore, OneHotStore, Store +import numpy as np +import pandas as pd + +from choice_learn.data.storage import FeaturesStorage, OneHotStorage def test_len_store(): - """Test the __len__ method of Store.""" - store = Store(values=[1, 2, 3, 4], sequence=[0, 1, 2, 3, 0, 1, 2, 3]) - assert len(store) == 8 + """Test the __len__ method of Storage.""" + features = {"customerA": [1, 2], "customerB": [4, 5], "customerC": [7, 8]} + storage = FeaturesStorage( + values=features, values_names=["age", "income", "children_nb"], name="customers" + ) + assert len(storage) == 3 + assert storage.shape == (3, 2) def test_get_store_element(): """Test the _get_store_element method of Store.""" - store = Store(values=[1, 2, 3, 4], sequence=[0, 1, 2, 3, 0, 1, 2, 3]) - assert store._get_store_element(0) == 1 - assert store._get_store_element([0, 1, 2]) == [1, 2, 3] + features = {"customerA": [1, 2], "customerB": [4, 5], "customerC": [7, 8]} + storage = FeaturesStorage( + values=features, values_names=["age", "income", "children_nb"], name="customers" + ) + assert (storage.get_element_from_index(0) == np.array([1, 2])).all() + assert (storage.get_element_from_index([0, 1, 2]) == 
np.array([[1, 2], [4, 5], [7, 8]])).all() def test_store_batch(): """Test the batch method of Store.""" - store = Store(values=[1, 2, 3, 4], sequence=[0, 1, 2, 3, 0, 1, 2, 3]) - assert store.batch[1] == 2 - assert store.batch[2:4] == [3, 4] - assert store.batch[[2, 3, 6, 7]] == [3, 4, 3, 4] + features = {"customerA": [1, 2], "customerB": [4, 5], "customerC": [7, 8]} + storage = FeaturesStorage( + values=features, values_names=["age", "income", "children_nb"], name="customers" + ) + assert (storage.batch["customerA"] == np.array([1, 2])).all() + assert ( + storage.batch[["customerA", "customerC", "customerA", "customerC"]] + == np.array([[1, 2], [7, 8], [1, 2], [7, 8]]) + ).all() def test_featuresstore_instantiation(): """Test the instantiation of FeaturesStore.""" - store = FeaturesStore( - values=[[10, 10], [4, 4], [2, 2], [8, 8]], - sequence=[0, 1, 2, 3, 0, 1, 2, 3], - indexes=[0, 1, 2, 3], + features = {"customerA": [1, 2], "customerB": [4, 5], "customerC": [7, 8]} + storage = FeaturesStorage( + values=features, values_names=["age", "income", "children_nb"], name="customers" ) - assert store.shape == (8, 2) - assert [store.sequence[i] == [0, 1, 2, 3, 0, 1, 2, 3][i] for i in range(8)] - assert store.store == {0: [10, 10], 1: [4, 4], 2: [2, 2], 3: [8, 8]} + + for k, v in storage.storage.items(): + assert ( + v + == { + "customerA": np.array([1, 2]), + "customerB": np.array([4, 5]), + "customerC": np.array([7, 8]), + }[k] + ).all() def test_featuresstore_instantiation_indexless(): """Test the instantiation of FeaturesStore.""" - store = FeaturesStore( - values=[[10, 10], [4, 4], [2, 2], [8, 8]], sequence=[0, 1, 2, 3, 0, 1, 2, 3] + features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + ids = ["customerA", "customerB", "customerC"] + + storage = FeaturesStorage( + ids=ids, values=features, values_names=["age", "income", "children_nb"], name="customers" ) - assert store.shape == (8, 2) - assert [store.sequence[i] == [0, 1, 2, 3, 0, 1, 2, 3][i] for i in range(8)] - assert store.store == {0: [10, 10], 1: [4, 4], 2: [2, 2], 3: [8, 8]} + assert storage.shape == (3, 3) + for k, v in storage.storage.items(): + assert ( + v + == { + "customerA": np.array([1, 2, 3]), + "customerB": np.array([4, 5, 6]), + "customerC": np.array([7, 8, 9]), + }[k] + ).all() def test_featuresstore_instantiation_from_list(): """Test the instantiation of FeaturesStore.""" - store = FeaturesStore.from_list( - values_list=[[10, 10], [4, 4], [2, 2], [8, 8]], sequence=[0, 1, 2, 3, 0, 1, 2, 3] + features = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + + storage = FeaturesStorage( + values=features, values_names=["age", "income", "children_nb"], name="customers" ) - assert store.shape == (8, 2) - assert [store.sequence[i] == [0, 1, 2, 3, 0, 1, 2, 3][i] for i in range(8)] - assert store.store == {0: [10, 10], 1: [4, 4], 2: [2, 2], 3: [8, 8]} + storage.batch[[0, 2, 0, 2]] + assert storage.shape == (3, 3) + for k, v in storage.storage.items(): + assert ( + v == {0: np.array([1, 2, 3]), 1: np.array([4, 5, 6]), 2: np.array([7, 8, 9])}[k] + ).all() def test_featuresstore_instantiation_fromdict(): """Test the instantiation of FeaturesStore.""" - store = FeaturesStore.from_dict( - values_dict={0: [10, 10], 1: [4, 4], 2: [2, 2], 3: [8, 8]}, - sequence=[0, 1, 2, 3, 0, 1, 2, 3], - ) - assert store.shape == (8, 2) - assert [store.sequence[i] == [0, 1, 2, 3, 0, 1, 2, 3][i] for i in range(8)] - assert store.store == {0: [10, 10], 1: [4, 4], 2: [2, 2], 3: [8, 8]} + features = { + "age": [1, 4, 7], + "income": [2, 5, 8], + "children_nb": [3, 6, 9], + "id": 
+    }
+    features = pd.DataFrame(features)
+    storage = FeaturesStorage(values=features, name="customers")
+    assert storage.shape == (3, 3)
+    for k, v in storage.storage.items():
+        assert (
+            v
+            == {
+                "customerA": np.array([1, 2, 3]),
+                "customerB": np.array([4, 5, 6]),
+                "customerC": np.array([7, 8, 9]),
+            }[k]
+        ).all()
+
+
+def test_featuresstore_instantiation_fromdf():
+    """Test the instantiation of FeaturesStorage from an id-indexed DataFrame."""
+    features = {"age": [1, 4, 7], "income": [2, 5, 8], "children_nb": [3, 6, 9]}
+    features = pd.DataFrame(features, index=["customerA", "customerB", "customerC"])
+    storage = FeaturesStorage(values=features, name="customers")
+    assert storage.shape == (3, 3)
+    for k, v in storage.storage.items():
+        assert (
+            v
+            == {
+                "customerA": np.array([1, 2, 3]),
+                "customerB": np.array([4, 5, 6]),
+                "customerC": np.array([7, 8, 9]),
+            }[k]
+        ).all()


 def test_featuresstore_getitem():
     """Test the __getitem__ method of FeaturesStore."""
-    store = FeaturesStore.from_dict(
-        values_dict={0: [10, 10], 1: [4, 4], 2: [2, 2], 3: [8, 8]},
-        sequence=[0, 1, 2, 3, 0, 1, 2, 3],
+    features = {"customerA": [1, 2], "customerB": [4, 5], "customerC": [7, 8]}
+    storage = FeaturesStorage(
+        values=features, values_names=["age", "income"], name="customers"
     )
-    sub_store = store[0:3]
-    assert sub_store.shape == (3, 2)
-    assert [sub_store.sequence[i] == [0, 1, 2][i] for i in range(3)]
-    assert sub_store.store == {0: [10, 10], 1: [4, 4], 2: [2, 2]}
+    sub_storage = storage[["customerA", "customerC"]]
+    assert sub_storage.shape == (2, 2)
+    for k, v in {"customerA": np.array([1, 2]), "customerC": np.array([7, 8])}.items():
+        assert (v == sub_storage.storage[k]).all()


 def test_onehotstore_instantiation():
     """Test the instantiation of OneHotStore."""
-    indexes = [0, 1, 2, 4]
-    values = [0, 1, 2, 3]
-    sequence = [0, 1, 2, 4, 0, 1, 2, 4]
-    store = OneHotStore(indexes=indexes, values=values, sequence=sequence)
-    assert store.shape == (8, 4)
-    assert [store.sequence[i] == [0, 1, 2, 4, 0, 1, 2, 4][i] for i in range(8)]
-    assert store.store == {0: 0, 1: 1, 2: 2, 4: 3}
+    ids = [0, 1, 2, 3, 4]
+    values = [4, 3, 2, 1, 0]
+    storage = OneHotStorage(ids=ids, values=values, name="OneHotTest")
+    assert storage.shape == (5, 5)
+    assert storage.storage == {0: 4, 1: 3, 2: 2, 3: 1, 4: 0}


 def test_onehotstore_instantiation_from_sequence():
     """Test the instantiation; from_sequence of OneHotStore."""
-    sequence = [0, 1, 2, 3, 0, 1, 2, 3]
-    store = OneHotStore.from_sequence(sequence=sequence)
-    assert store.shape == (8, 4)
-    assert [store.sequence[i] == [0, 1, 2, 3, 0, 1, 2, 3][i] for i in range(8)]
-    assert store.store == {0: 0, 1: 1, 2: 2, 3: 3}
+    values = [4, 3, 2, 1, 0]
+    storage = OneHotStorage(values=values, name="OneHotTest")
+    assert (
+        storage.batch[[0, 2, 4]] == np.array([[0, 0, 0, 0, 1], [0, 0, 1, 0, 0], [1, 0, 0, 0, 0]])
+    ).all()
+    assert storage.storage == {4: 0, 3: 1, 2: 2, 1: 3, 0: 4}


-def test_onehotstore_getitem():
-    """Test the getitem of OneHotStore."""
-    indexes = [0, 1, 2, 4]
-    values = [0, 1, 2, 3]
-    sequence = [0, 1, 2, 4, 0, 1, 2, 4]
-    store = OneHotStore(indexes=indexes, values=values, sequence=sequence)
-    sub_store = store[0:3]
-    assert sub_store.shape == (3, 3)
-    assert [
-        sub_store.sequence[i] == [0, 1, 2, 3, 0, 1, 2, 3][i] for i in range(len(sub_store.sequence))
-    ]
-    assert sub_store.store == {
-        0: 0,
-        1: 1,
-        2: 2,
-    }
+def test_onehotstore_instantiation_from_ids():
+    """Test the instantiation of OneHotStorage from ids only."""
+    ids = [0, 1, 2, 3, 4]
+    storage = OneHotStorage(ids=ids, name="OneHotTest")
+    assert (
+        storage.batch[[0, 2, 4]] == np.array([[1, 0, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 1]])
+    ).all()
+    assert storage.storage == {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}


-def test_onehotstore_batch():
-    """Test the getitem of OneHotStore."""
-    indexes = [0, 1, 2, 4]
-    values = [0, 1, 2, 3]
-    sequence = [0, 1, 2, 4, 0, 1, 2, 4]
-    store = OneHotStore(indexes=indexes, values=values, sequence=sequence)
-
-    batch = store.batch[0]
-    assert (batch == [1, 0, 0, 0]).all()
+def test_onehotstore_instantiation_from_dict():
+    """Test the instantiation of OneHotStorage from a dict of values."""
+    ids = [0, 1, 2, 3, 4]
+    values = [4, 3, 2, 1, 0]
+    values_dict = {k: v for k, v in zip(ids, values)}
+    storage = OneHotStorage(values=values_dict, name="OneHotTest")
+    assert (
+        storage.batch[[0, 2, 4]] == np.array([[0, 0, 0, 0, 1], [0, 0, 1, 0, 0], [1, 0, 0, 0, 0]])
+    ).all()
+    assert storage.storage == {4: 0, 3: 1, 2: 2, 1: 3, 0: 4}

-    batch = store.batch[0:4]
-    assert (batch == [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]).all()

-    batch = store.batch[[3, 6, 7]]
-    assert (batch == [[0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 0, 1]]).all()
+def test_onehotstore_getitem():
+    """Test the getitem of OneHotStorage."""
+    ids = [0, 1, 2, 3, 4]
+    values = [4, 3, 2, 1, 0]
+    storage = OneHotStorage(ids=ids, values=values, name="OneHotTest")
+    assert (
+        storage.batch[[0, 2, 4]] == np.array([[0, 0, 0, 0, 1], [0, 0, 1, 0, 0], [1, 0, 0, 0, 0]])
+    ).all()
+    assert storage.get_element_from_index(0) == 4
+
+
+def test_fail_instantiation():
+    """Test that instantiation fails when neither ids nor values are given."""
+    try:
+        _ = OneHotStorage(name="OneHotTest")
+        assert False, "OneHotStorage should raise a ValueError without ids or values"
+    except ValueError:
+        assert True

From dc6fdf3a397f21756386d61c09bd1b263bebde34 Mon Sep 17 00:00:00 2001
From: VincentAuriau
Date: Mon, 15 Jan 2024 15:59:06 +0100
Subject: [PATCH 22/22] FIX: mkdocs after wrong merge

---
 mkdocs.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mkdocs.yaml b/mkdocs.yaml
index b9054fe2..93d3cd97 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -36,9 +36,10 @@ markdown_extensions:
 - pymdownx.superfences
 - mdx_math

-plugins:
-- extra_javascript:
+extra_javascript:
 - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.4/MathJax.js?config=TeX-AMS-MML_HTMLorMML
+
+plugins:
 - mkdocstrings:
     handlers:
       python:
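
A minimal usage sketch of the storage API these patches converge on, written against the behaviour the tests above assert. The FeaturesStorage/OneHotStorage signatures and the batch indexing semantics are taken from that test code, not from library documentation, and the variable names here are illustrative:

import numpy as np

from choice_learn.data.storage import FeaturesStorage, OneHotStorage

# Features keyed by id; values_names labels the feature columns.
features = {"customerA": [1, 2], "customerB": [4, 5], "customerC": [7, 8]}
customers = FeaturesStorage(values=features, values_names=["age", "income"], name="customers")

# Batch access by id supports arbitrary order and repetitions.
batch = customers.batch[["customerA", "customerC", "customerA"]]
assert (batch == np.array([[1, 2], [7, 8], [1, 2]])).all()

# OneHotStorage maps each id to a one-hot row of length len(ids).
one_hot = OneHotStorage(ids=[0, 1, 2], name="availability")
assert (one_hot.batch[[0, 2]] == np.array([[1, 0, 0], [0, 0, 1]])).all()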