Merge pull request #33 from artefactory/optim

Additions: - TaFeng Dataset - AssortmentOptimizer class based on Gurobi - An example on the TaFeng Dataset Fixes: - Optimized NLL - some typos
artefactory · Mar 5, 2024 · 4fbf727 · 4fbf727
2 parents f079757 + 257f9b3
commit 4fbf727
Show file tree

Hide file tree

Showing 6 changed files with 585 additions and 1 deletion.
diff --git a/choice_learn/datasets/data/ta_feng.csv.zip b/choice_learn/datasets/data/ta_feng.csv.zip
diff --git a/choice_learn/datasets/examples.py b/choice_learn/datasets/examples.py
@@ -0,0 +1,117 @@
+"""Some datasets used for personal examples."""
+import os
+
+import numpy as np
+import pandas as pd
+
+from choice_learn.data.choice_dataset import ChoiceDataset
+
+# Directory holding the packaged data files. It is resolved from this module's own
+# location so that loading does not depend on the current working directory.
+DATA_MODULE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+def load_tafeng(as_frame=False, preprocessing=None):
+    """Load the TaFeng grocery dataset.
+
+    Original file and information can be found here:
+    https://www.kaggle.com/datasets/chiranjivdas09/ta-feng-grocery-dataset/
+
+    Parameters
+    ----------
+    as_frame : bool, optional
+        Whether to return the original file as a pd.DataFrame, by default False
+    preprocessing : str, optional
+        Predefined pre-processing to apply, by default None.
+        Only "assort_example" is handled; with any other value the raw
+        DataFrame is returned.
+
+    Returns
+    -------
+    pd.DataFrame or ChoiceDataset
+        TaFeng Grocery Dataset. With preprocessing="assort_example", a ChoiceDataset
+        with one choice per purchased unit, the one-hot encoded age group as context
+        feature and the last observed price of every item as item feature.
+    """
+    filepath = os.path.join(DATA_MODULE, "ta_feng.csv.zip")
+    tafeng_df = pd.read_csv(filepath)
+    if as_frame or preprocessing != "assort_example":
+        return tafeng_df
+
+    # Keep the products of sub-class 100505 that appear in more than 20 rows.
+    subclass_counts = tafeng_df.loc[
+        tafeng_df.PRODUCT_SUBCLASS == 100505
+    ].PRODUCT_ID.value_counts()
+    prods = subclass_counts.index[(subclass_counts > 20).to_numpy()]
+    subdf = tafeng_df.loc[tafeng_df.PRODUCT_ID.isin(prods)].dropna().reset_index(drop=True)
+
+    # One row per product, in order of first appearance: gives the item list
+    # and the price used for an item until its first purchase is reached.
+    first_rows = subdf.drop_duplicates(subset="PRODUCT_ID")
+    items = first_rows.PRODUCT_ID.tolist()
+    item_positions = {item: position for position, item in enumerate(items)}
+    # NOTE(review): the initial price is the raw SALES_PRICE of the first row, whereas
+    # the prices set below are SALES_PRICE / AMOUNT - confirm this is intended.
+    curr_prices = first_rows.SALES_PRICE.tolist()
+
+    # One-hot encoding of the age groups into three buckets.
+    age_groups = {
+        "<25": [1, 0, 0],
+        "25-29": [0, 1, 0],
+        "30-34": [0, 1, 0],
+        "35-39": [0, 1, 0],
+        "40-44": [0, 1, 0],
+        "45-49": [0, 1, 0],
+        "50-54": [0, 0, 1],
+        "55-59": [0, 0, 1],
+        "60-64": [0, 0, 1],
+        ">65": [0, 0, 1],
+    }
+
+    all_prices = []
+    customer_features = []
+    choices = []
+
+    for item, sales_price, amount, age in zip(
+        subdf.PRODUCT_ID, subdf.SALES_PRICE, subdf.AMOUNT, subdf.AGE_GROUP
+    ):
+        n_units = int(amount)
+        if n_units <= 0:
+            # Nothing was bought on this row: no choice to record, prices unchanged.
+            continue
+
+        item_index = item_positions[item]
+        # Price of one unit; it stays the current price of the item until its next row.
+        curr_prices[item_index] = sales_price / amount
+        age_encoding = age_groups[age]
+
+        # Each purchased unit is recorded as one separate choice.
+        for _ in range(n_units):
+            customer_features.append(age_encoding)
+            choices.append(item_index)
+            all_prices.append(list(curr_prices))
+
+    all_prices = np.expand_dims(np.array(all_prices), axis=-1)
+    customer_features = np.array(customer_features).astype("float32")
+    choices = np.array(choices)
+
+    # Create Dataset - every kept item is considered available for every choice.
+    return ChoiceDataset(
+        contexts_features=customer_features,
+        choices=choices,
+        contexts_items_features=all_prices,
+        contexts_items_availabilities=np.ones((len(choices), len(items))).astype("float32"),
+    )
diff --git a/choice_learn/models/base_model.py b/choice_learn/models/base_model.py
@@ -1299,6 +1299,7 @@ def _em_fit(self, dataset, verbose=0):
         """
         hist_logits = []
         hist_loss = []
+
         # Initialization
         for model in self.models:
             # model.instantiate()

diff --git a/choice_learn/toolbox/assortment_optimizer.py b/choice_learn/toolbox/assortment_optimizer.py
@@ -0,0 +1,135 @@
+"""Tool function for assortment optimization."""
+import gurobipy as gp
+import numpy as np
+
+"""TODO: clarify outside good integration
+TODO 2: ADD easy integration of additional constraints
+"""
+
+
+class AssortmentOptimizer:
+    """Base class for assortment optimization.
+
+    The assortment maximizing the expected value
+    sum_j(value_j * weight_j * x_j) / sum_j(weight_j * x_j)
+    is found with a linear program built with Gurobi, the ratio being linearized
+    with the Charnes-Cooper transformation. Index 0 always denotes the outside option.
+    """
+
+    def __init__(self, utilities, itemwise_values, assortment_size, outside_option_given=False):
+        """Initialize the AssortmentOptimizer object.
+
+        Parameters
+        ----------
+        utilities : Iterable
+            Preference weight of each item. They are used as multiplicative weights
+            (the automatically added outside option gets exp(0) = 1), hence they are
+            expected on the exponential scale, i.e. exp(utility).
+        itemwise_values : Iterable
+            List of to-be-optimized values for each item, e.g. prices.
+        assortment_size : int or None
+            Size of the requested assortment. None disables the size constraint.
+            NOTE(review): both a <= and a >= constraint are set, so the size is
+            enforced exactly and not as a maximum - confirm this is intended.
+        outside_option_given : bool
+            If True, the first entry of `utilities` and `itemwise_values` describes the
+            outside option. If False, an outside option of weight 1 and value 0 is
+            automatically prepended.
+
+        Raises
+        ------
+        ValueError
+            If `utilities` and `itemwise_values` do not have the same length.
+        """
+        if len(utilities) != len(itemwise_values):
+            raise ValueError(
+                "You should provide as many utilities as itemwise values. "
+                f"Found {len(utilities)} and {len(itemwise_values)} instead."
+            )
+        self.outside_option_given = outside_option_given
+        if self.outside_option_given:
+            # The outside option is already the first entry: use the inputs as they are.
+            self.utilities = np.asarray(utilities, dtype=float)
+            self.itemwise_values = np.asarray(itemwise_values, dtype=float)
+        else:
+            self.utilities = np.concatenate([[np.exp(0.0)], utilities], axis=0)
+            self.itemwise_values = np.concatenate([[0.0], itemwise_values], axis=0)
+        # Number of items, outside option (index 0) excluded.
+        self.n_items = len(self.utilities) - 1
+        self.assortment_size = assortment_size
+
+        self.solver = self.base_instantiate()
+        self.set_base_constraints()
+
+    def base_instantiate(self):
+        """Instantiate the solver with its variables and objective.
+
+        One continuous, non-negative variable y_j is created per item, outside option
+        included. With t = 1 / sum_j(weight_j * x_j), y_j stands for t * x_j so that
+        the expected value is the linear expression sum_j(value_j * weight_j * y_j).
+
+        Returns
+        -------
+        gurobipy.Model
+            solver with its variables and maximization objective.
+        """
+        # Create a new model
+        solver = gp.Model("Assortment_IP")
+        solver.ModelSense = -1  # maximization
+        solver.setParam("OutputFlag", False)
+
+        # Create variables, their objective coefficient being value_j * weight_j
+        y = {}
+        for j in range(self.n_items + 1):
+            y[j] = solver.addVar(
+                vtype=gp.GRB.CONTINUOUS,
+                obj=float(self.utilities[j] * self.itemwise_values[j]),
+                name=f"y_{j}",
+            )
+        self.y = y
+        # Integrate new variables
+        solver.update()
+
+        return solver
+
+    def set_base_constraints(self):
+        """Set the base constraints of the LP.
+
+        In particular, ensures Charnes-Cooper transformation constraints
+        and assortment size constraint.
+        """
+        item_indexes = range(1, self.n_items + 1)
+
+        # An item is at most "fully" selected: y_j = t * x_j <= t = y_0
+        for j in item_indexes:
+            self.solver.addConstr(self.y[j] <= self.y[0])
+
+        # Charnes-Cooper normalization: sum_j(weight_j * y_j) = 1, outside option included
+        charnes_cooper = gp.quicksum(
+            float(self.utilities[j]) * self.y[j] for j in range(self.n_items + 1)
+        )
+        self.solver.addConstr(charnes_cooper == 1)
+
+        # Assortment size constraint, over ALL the items (outside option excluded)
+        if self.assortment_size is not None:
+            n_selected = gp.quicksum(self.y[j] for j in item_indexes)
+            self.solver.addConstr(n_selected <= self.assortment_size * self.y[0])
+            self.solver.addConstr(n_selected >= self.assortment_size * self.y[0])
+
+        # Integrate constraints
+        self.solver.update()
+
+    def set_objective_function(self, itemwise_values):
+        """Define the objective function to maximize with the assortment.
+
+        Not implemented yet.
+
+        Parameters
+        ----------
+        itemwise_values : list-like
+            List of values for each item - total value to be optimized.
+        """
+        raise NotImplementedError
+
+    def add_constraint(self):
+        """Add constraints. Not implemented yet."""
+        raise NotImplementedError
+
+    def solve(self):
+        """Solve the optimization problem.
+
+        The solver status is stored in `self.status`.
+
+        Returns
+        -------
+        np.ndarray
+            Array of 0s and 1s, indicating the presence of each item in the optimal
+            assortment. Its first entry is the outside option if it was given at
+            initialization, otherwise only the given items are represented.
+        float
+            Objective value reached by the solver.
+        """
+        self.solver.update()
+
+        # -- Optimize --
+        self.solver.optimize()
+        self.status = self.solver.Status
+
+        # y[0] is the outside option: it is reported only if the user provided it,
+        # so that the result is always aligned with the constructor inputs.
+        first_index = 0 if self.outside_option_given else 1
+        assortment = np.zeros(self.n_items + 1 - first_index)
+        for j in range(first_index, self.n_items + 1):
+            if self.y[j].x > 0:
+                assortment[j - first_index] = 1
+
+        return assortment, self.solver.objVal