Commit 61f4be4

Using get_path functionality + global variables (#13)

get_path use for all datasets

Jad-yehya authored Aug 6, 2024
1 parent 3aad2b2 commit 61f4be4
Showing 3 changed files with 54 additions and 55 deletions.
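
All three datasets previously downloaded into a hard-coded data/<NAME>/ folder relative to the current working directory. The commit replaces this with benchopt's config.get_data_path(key=...), which resolves the dataset folder under the user-configured benchopt data directory and returns a pathlib.Path; the download URLs are also promoted to module-level UPPER_CASE constants. A minimal sketch of the resulting pattern, with a placeholder dataset name and URL (the mkdir call is an assumption of this sketch — the diffs below rely on the folder already existing, and whether get_data_path creates it is not shown here):

from benchopt import BaseDataset, safe_import_context, config

with safe_import_context() as import_ctx:
    import numpy as np
    import requests

# Placeholder URL for illustration only
URL_XTRAIN = "https://example.com/DEMO_train.npy"


class Dataset(BaseDataset):
    name = "DEMO"

    def get_data(self):
        # Resolves to <benchopt data dir>/DEMO and returns a pathlib.Path
        path = config.get_data_path(key="DEMO")
        path.mkdir(parents=True, exist_ok=True)  # assumption: folder may not exist yet

        target = path / "DEMO_train.npy"
        if not target.exists():
            response = requests.get(URL_XTRAIN)
            target.write_bytes(response.content)

        return dict(X_train=np.load(target))
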
37 changes: 16 additions & 21 deletions datasets/msl.py
@@ -1,22 +1,22 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset, safe_import_context, config
 
 with safe_import_context() as import_ctx:
-    import os
+    import pathlib
     import numpy as np
     import requests
 
 # Create global variables to store the urls
-url_xtrain = (
+URL_XTRAIN = (
     "https://drive.google.com/uc?&id="
     "1PMzjODVFblVnwq8xo7pKHrdbczPxdqTa&export=download"
 )
 
-url_xtest = (
+URL_XTEST = (
     "https://drive.google.com/uc?&id="
     "1OcNc0YQsOMw9jQIIHgiOXVG03wjXbEiM&export=download"
 )
 
-url_ytest = (
+URL_YTEST = (
     "https://drive.google.com/uc?&id="
     "19vR0QvKluuiIT2H5mCFNIJh6xGVwshDd&export=download"
 )
@@ -33,27 +33,22 @@ class Dataset(BaseDataset):
     }
 
     def get_data(self):
-        # Adding get_data_path method soon
-
+        path = config.get_data_path(key="MSL")
         # Check if the data is already here
-        if not os.path.exists("data/MSL/MSL_train.npy"):
-            os.makedirs("data/MSL", exist_ok=True)
-
-            response = requests.get(url_xtrain)
-            with open("data/MSL/MSL_train.npy", "wb") as f:
+        if not pathlib.Path.exists(path):
+            response = requests.get(URL_XTRAIN)
+            with open(pathlib.Path(path) / "MSL_train.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_xtest)
-            with open("data/MSL/MSL_test.npy", "wb") as f:
+            response = requests.get(URL_XTEST)
+            with open(pathlib.Path(path) / "MSL_test.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_ytest)
-            with open("data/MSL/MSL_test_label.npy", "wb") as f:
+            response = requests.get(URL_YTEST)
+            with open(pathlib.Path(path) / "MSL_test_label.npy", "wb") as f:
                 f.write(response.content)
 
-        X_train = np.load("data/MSL/MSL_train.npy")
-        X_test = np.load("data/MSL/MSL_test.npy")
-        y_test = np.load("data/MSL/MSL_test_label.npy")
+        X_train = np.load(path / "MSL_train.npy")
+        X_test = np.load(path / "MSL_test.npy")
+        y_test = np.load(path / "MSL_test_label.npy")
 
         # Limiting the size of the dataset for testing purposes
         if self.debug:
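
Two remarks on this hunk. First, pathlib.Path.exists(path) works because path is already a Path object, but path.exists() is the idiomatic spelling, and the check now tests the dataset folder rather than the individual files; likewise, pathlib.Path(path) is redundant since get_data_path already returns a Path. Second, the same download-if-missing triplet now appears in all three files, so a small helper could factor it out. A hypothetical sketch (download_if_missing and files are names invented here, not part of the commit):

import pathlib
import requests


def download_if_missing(path: pathlib.Path, files) -> None:
    """Download each (url, filename) pair into path, skipping existing files."""
    path.mkdir(parents=True, exist_ok=True)
    for url, name in files:
        target = path / name
        if not target.exists():
            response = requests.get(url)
            response.raise_for_status()  # fail loudly on a bad download
            target.write_bytes(response.content)

Each get_data would then reduce to a single call, e.g. download_if_missing(path, [(URL_XTRAIN, "MSL_train.npy"), (URL_XTEST, "MSL_test.npy"), (URL_YTEST, "MSL_test_label.npy")]).
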
35 changes: 18 additions & 17 deletions datasets/psm.py
@@ -1,19 +1,19 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset, safe_import_context, config
 
 with safe_import_context() as import_ctx:
-    import os
     import requests
     import pandas as pd
+    import pathlib
 
-url_xtrain = (
+URL_XTRAIN = (
     "https://drive.google.com/uc?&id=1d3tAbYTj0CZLhB7z3IDTfTRg3E7qj_tw"
     "&export=download"
 )
-url_xtest = (
+URL_XTEST = (
     "https://drive.google.com/uc?&id=1RQH7igHhm_0GAgXyVpkJk6TenDl9rd53"
     "&export=download"
 )
-url_ytest = (
+URL_YTEST = (
     "https://drive.google.com/uc?&id=1SYgcRt0DH--byFbvkKTkezJKU5ZENZhw"
     "&export=download"
 )
@@ -29,27 +29,28 @@ class Dataset(BaseDataset):
 
     def get_data(self):
         # Check if the data is already here
-        if not os.path.exists("data/PSM/PSM_train.csv"):
-            os.makedirs("data/PSM", exist_ok=True)
-            response = requests.get(url_xtrain)
-            with open("data/PSM/PSM_train.csv", "wb") as f:
+        path = config.get_data_path(key="PSM")
+
+        if not pathlib.Path.exists(path):
+            response = requests.get(URL_XTRAIN)
+            with open(path / "PSM_train.csv", "wb") as f:
                 f.write(response.content)
-            response = requests.get(url_xtest)
-            with open("data/PSM/PSM_test.csv", "wb") as f:
+            response = requests.get(URL_XTEST)
+            with open(path / "PSM_test.csv", "wb") as f:
                 f.write(response.content)
-            response = requests.get(url_ytest)
-            with open("data/PSM/PSM_test_label.csv", "wb") as f:
+            response = requests.get(URL_YTEST)
+            with open(path / "PSM_test_label.csv", "wb") as f:
                 f.write(response.content)
 
-        X_train = pd.read_csv("data/PSM/PSM_train.csv")
+        X_train = pd.read_csv(path / "PSM_train.csv")
         X_train.fillna(X_train.mean(), inplace=True)
         X_train = X_train.to_numpy()
 
-        X_test = pd.read_csv("data/PSM/PSM_test.csv")
-        X_test.fillna(X_test.mean(), inplace=True)
+        X_test = pd.read_csv(path / "PSM_test.csv")
+        X_test.fillna(X_train.mean(), inplace=True)
         X_test = X_test.to_numpy()
 
-        y_test = pd.read_csv("data/PSM/PSM_test_label.csv").to_numpy()[:, 1]
+        y_test = pd.read_csv(path / "PSM_test_label.csv").to_numpy()[:, 1]
 
         # Limiting the size of the dataset for testing purposes
         if self.debug:
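
Besides the path migration, this hunk also switches the test-set imputation from X_test.mean() to X_train.mean(), so test NaNs are now filled with statistics of the training split, keeping test information out of preprocessing. One subtlety: X_train has already been converted via to_numpy() at that point, so X_train.mean() is a single scalar over all cells rather than a per-column mean. If per-column imputation with train statistics was the intent, a sketch would be (reusing path from the hunk above):

import pandas as pd

X_train = pd.read_csv(path / "PSM_train.csv")
train_means = X_train.mean()  # per-column means, computed on the training split only
X_train = X_train.fillna(train_means).to_numpy()

X_test = pd.read_csv(path / "PSM_test.csv")
X_test = X_test.fillna(train_means).to_numpy()  # impute test with train statistics
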
37 changes: 20 additions & 17 deletions datasets/smap.py
@@ -1,19 +1,20 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset, safe_import_context, config
 
 with safe_import_context() as import_ctx:
-    import os
+    import pathlib
     import numpy as np
     import requests
+    # from sklearn.model_selection import TimeSeriesSplit
 
-url_xtrain = (
+URL_XTRAIN = (
     "https://drive.google.com/uc?&id=1e_JhpIURD"
     "Lluw4IcHJF-dgtjjJXsPEKE&export=download"
 )
-url_xtest = (
+URL_XTEST = (
     "https://drive.google.com/uc?&id=10"
     "-r-Zm0nfQJp0i-mVg3iXs6x0u9Ua25a&export=download"
 )
-url_ytest = (
+URL_YTEST = (
     "https://drive.google.com/uc?&id=1uYiXqmK3C"
     "gyxk4U6-LgUni7JddQnlggs&export=download"
 )
@@ -23,33 +24,35 @@ class Dataset(BaseDataset):
     name = "SMAP"
 
     install_cmd = "conda"
-    requirements = ["pandas"]
+    requirements = ["pandas", "scikit-learn"]
 
     parameters = {
         "debug": [False],
+        "n_splits": [5],
+        "validation_size": [0.2],
     }
 
     def get_data(self):
+        path = config.get_data_path(key="SMAP")
+
         # Check if the data is already here
-        if not os.path.exists("data/SMAP/SMAP_train.npy"):
-            os.makedirs("data/SMAP", exist_ok=True)
+        if not pathlib.Path.exists(path):
 
-            response = requests.get(url_xtrain)
-            with open("data/SMAP/SMAP_train.npy", "wb") as f:
+            response = requests.get(URL_XTRAIN)
+            with open(pathlib.Path(path) / "SMAP_train.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_xtest)
-            with open("data/SMAP/SMAP_test.npy", "wb") as f:
+            response = requests.get(URL_XTEST)
+            with open(pathlib.Path(path) / "SMAP_test.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_ytest)
-            with open("data/SMAP/SMAP_test_label.npy", "wb") as f:
+            response = requests.get(URL_YTEST)
+            with open(pathlib.Path(path) / "SMAP_test_label.npy", "wb") as f:
                 f.write(response.content)
 
-        X_train = np.load("data/SMAP/SMAP_train.npy")
-        X_test = np.load("data/SMAP/SMAP_test.npy")
-        y_test = np.load("data/SMAP/SMAP_test_label.npy")
+        X_train = np.load(path / "SMAP_train.npy")
+        X_test = np.load(path / "SMAP_test.npy")
+        y_test = np.load(path / "SMAP_test_label.npy")
 
         # Limiting the size of the dataset for testing purposes
         if self.debug:
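
datasets/smap.py additionally adds scikit-learn to the conda requirements and two new parameters, n_splits and validation_size; together with the commented-out import above, this suggests time-series cross-validation with sklearn's TimeSeriesSplit is planned. For reference, a minimal sketch of how n_splits could drive it (hypothetical wiring, not part of this commit; how validation_size maps onto the splitter is not specified here):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X_train = np.random.default_rng(0).normal(size=(100, 5))  # stand-in for the SMAP series

tscv = TimeSeriesSplit(n_splits=5)  # would come from the n_splits parameter
for train_idx, val_idx in tscv.split(X_train):
    # each validation fold starts strictly after its training fold,
    # preserving temporal order
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
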
