Commit 61f4be4

Using get_path functionality + global variables (#13)

get_path use for all datasets

Jad-yehya authored Aug 6, 2024
1 parent 3aad2b2 commit 61f4be4
Showing 3 changed files with 54 additions and 55 deletions.
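
All three datasets previously downloaded into a hard-coded data/<NAME>/ folder relative to the current working directory. The commit replaces this with benchopt's config.get_data_path(key=...), which resolves the dataset folder under the user-configured benchopt data directory and returns a pathlib.Path; the download URLs are also promoted to module-level UPPER_CASE constants. A minimal sketch of the resulting pattern, with a placeholder dataset name and URL (the mkdir call is an assumption of this sketch — the diffs below rely on the folder already existing, and whether get_data_path creates it is not shown here):

from benchopt import BaseDataset, safe_import_context, config

with safe_import_context() as import_ctx:
    import numpy as np
    import requests

# Placeholder URL for illustration only
URL_XTRAIN = "https://example.com/DEMO_train.npy"


class Dataset(BaseDataset):
    name = "DEMO"

    def get_data(self):
        # Resolves to <benchopt data dir>/DEMO and returns a pathlib.Path
        path = config.get_data_path(key="DEMO")
        path.mkdir(parents=True, exist_ok=True)  # assumption: folder may not exist yet

        target = path / "DEMO_train.npy"
        if not target.exists():
            response = requests.get(URL_XTRAIN)
            target.write_bytes(response.content)

        return dict(X_train=np.load(target))
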
37 changes: 16 additions & 21 deletions datasets/msl.py
@@ -1,22 +1,22 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset, safe_import_context, config
 
 with safe_import_context() as import_ctx:
-    import os
+    import pathlib
     import numpy as np
     import requests
 
 # Create global variables to store the urls
-url_xtrain = (
+URL_XTRAIN = (
     "https://drive.google.com/uc?&id="
     "1PMzjODVFblVnwq8xo7pKHrdbczPxdqTa&export=download"
 )
 
-url_xtest = (
+URL_XTEST = (
     "https://drive.google.com/uc?&id="
     "1OcNc0YQsOMw9jQIIHgiOXVG03wjXbEiM&export=download"
 )
 
-url_ytest = (
+URL_YTEST = (
     "https://drive.google.com/uc?&id="
     "19vR0QvKluuiIT2H5mCFNIJh6xGVwshDd&export=download"
 )
@@ -33,27 +33,22 @@ class Dataset(BaseDataset):
     }
 
     def get_data(self):
-        # Adding get_data_path method soon
-
+        path = config.get_data_path(key="MSL")
         # Check if the data is already here
-        if not os.path.exists("data/MSL/MSL_train.npy"):
-            os.makedirs("data/MSL", exist_ok=True)
-
-            response = requests.get(url_xtrain)
-            with open("data/MSL/MSL_train.npy", "wb") as f:
+        if not pathlib.Path.exists(path):
+            response = requests.get(URL_XTRAIN)
+            with open(pathlib.Path(path) / "MSL_train.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_xtest)
-            with open("data/MSL/MSL_test.npy", "wb") as f:
+            response = requests.get(URL_XTEST)
+            with open(pathlib.Path(path) / "MSL_test.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_ytest)
-            with open("data/MSL/MSL_test_label.npy", "wb") as f:
+            response = requests.get(URL_YTEST)
+            with open(pathlib.Path(path) / "MSL_test_label.npy", "wb") as f:
                 f.write(response.content)
 
-        X_train = np.load("data/MSL/MSL_train.npy")
-        X_test = np.load("data/MSL/MSL_test.npy")
-        y_test = np.load("data/MSL/MSL_test_label.npy")
+        X_train = np.load(path / "MSL_train.npy")
+        X_test = np.load(path / "MSL_test.npy")
+        y_test = np.load(path / "MSL_test_label.npy")
 
         # Limiting the size of the dataset for testing purposes
         if self.debug:
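
Two remarks on this hunk. First, pathlib.Path.exists(path) works because path is already a Path object, but path.exists() is the idiomatic spelling, and the check now tests the dataset folder rather than the individual files; likewise, pathlib.Path(path) is redundant since get_data_path already returns a Path. Second, the same download-if-missing triplet now appears in all three files, so a small helper could factor it out. A hypothetical sketch (download_if_missing and files are names invented here, not part of the commit):

import pathlib
import requests


def download_if_missing(path: pathlib.Path, files) -> None:
    """Download each (url, filename) pair into path, skipping existing files."""
    path.mkdir(parents=True, exist_ok=True)
    for url, name in files:
        target = path / name
        if not target.exists():
            response = requests.get(url)
            response.raise_for_status()  # fail loudly on a bad download
            target.write_bytes(response.content)

Each get_data would then reduce to a single call, e.g. download_if_missing(path, [(URL_XTRAIN, "MSL_train.npy"), (URL_XTEST, "MSL_test.npy"), (URL_YTEST, "MSL_test_label.npy")]).
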
35 changes: 18 additions & 17 deletions datasets/psm.py
@@ -1,19 +1,19 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset, safe_import_context, config
 
 with safe_import_context() as import_ctx:
-    import os
     import requests
     import pandas as pd
+    import pathlib
 
-url_xtrain = (
+URL_XTRAIN = (
     "https://drive.google.com/uc?&id=1d3tAbYTj0CZLhB7z3IDTfTRg3E7qj_tw"
     "&export=download"
 )
-url_xtest = (
+URL_XTEST = (
     "https://drive.google.com/uc?&id=1RQH7igHhm_0GAgXyVpkJk6TenDl9rd53"
     "&export=download"
 )
-url_ytest = (
+URL_YTEST = (
     "https://drive.google.com/uc?&id=1SYgcRt0DH--byFbvkKTkezJKU5ZENZhw"
     "&export=download"
 )
@@ -29,27 +29,28 @@ class Dataset(BaseDataset):
 
     def get_data(self):
         # Check if the data is already here
-        if not os.path.exists("data/PSM/PSM_train.csv"):
-            os.makedirs("data/PSM", exist_ok=True)
-            response = requests.get(url_xtrain)
-            with open("data/PSM/PSM_train.csv", "wb") as f:
+        path = config.get_data_path(key="PSM")
+
+        if not pathlib.Path.exists(path):
+            response = requests.get(URL_XTRAIN)
+            with open(path / "PSM_train.csv", "wb") as f:
                 f.write(response.content)
-            response = requests.get(url_xtest)
-            with open("data/PSM/PSM_test.csv", "wb") as f:
+            response = requests.get(URL_XTEST)
+            with open(path / "PSM_test.csv", "wb") as f:
                 f.write(response.content)
-            response = requests.get(url_ytest)
-            with open("data/PSM/PSM_test_label.csv", "wb") as f:
+            response = requests.get(URL_YTEST)
+            with open(path / "PSM_test_label.csv", "wb") as f:
                 f.write(response.content)
 
-        X_train = pd.read_csv("data/PSM/PSM_train.csv")
+        X_train = pd.read_csv(path / "PSM_train.csv")
         X_train.fillna(X_train.mean(), inplace=True)
         X_train = X_train.to_numpy()
 
-        X_test = pd.read_csv("data/PSM/PSM_test.csv")
-        X_test.fillna(X_test.mean(), inplace=True)
+        X_test = pd.read_csv(path / "PSM_test.csv")
+        X_test.fillna(X_train.mean(), inplace=True)
         X_test = X_test.to_numpy()
 
-        y_test = pd.read_csv("data/PSM/PSM_test_label.csv").to_numpy()[:, 1]
+        y_test = pd.read_csv(path / "PSM_test_label.csv").to_numpy()[:, 1]
 
         # Limiting the size of the dataset for testing purposes
         if self.debug:
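
Besides the path migration, this hunk also switches the test-set imputation from X_test.mean() to X_train.mean(), so test NaNs are now filled with statistics of the training split, keeping test information out of preprocessing. One subtlety: X_train has already been converted via to_numpy() at that point, so X_train.mean() is a single scalar over all cells rather than a per-column mean. If per-column imputation with train statistics was the intent, a sketch would be (reusing path from the hunk above):

import pandas as pd

X_train = pd.read_csv(path / "PSM_train.csv")
train_means = X_train.mean()  # per-column means, computed on the training split only
X_train = X_train.fillna(train_means).to_numpy()

X_test = pd.read_csv(path / "PSM_test.csv")
X_test = X_test.fillna(train_means).to_numpy()  # impute test with train statistics
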
37 changes: 20 additions & 17 deletions datasets/smap.py
@@ -1,19 +1,20 @@
-from benchopt import BaseDataset, safe_import_context
+from benchopt import BaseDataset, safe_import_context, config
 
 with safe_import_context() as import_ctx:
-    import os
+    import pathlib
     import numpy as np
     import requests
+    # from sklearn.model_selection import TimeSeriesSplit
 
-url_xtrain = (
+URL_XTRAIN = (
     "https://drive.google.com/uc?&id=1e_JhpIURD"
     "Lluw4IcHJF-dgtjjJXsPEKE&export=download"
 )
-url_xtest = (
+URL_XTEST = (
     "https://drive.google.com/uc?&id=10"
     "-r-Zm0nfQJp0i-mVg3iXs6x0u9Ua25a&export=download"
 )
-url_ytest = (
+URL_YTEST = (
     "https://drive.google.com/uc?&id=1uYiXqmK3C"
     "gyxk4U6-LgUni7JddQnlggs&export=download"
 )
@@ -23,33 +24,35 @@ class Dataset(BaseDataset):
     name = "SMAP"
 
     install_cmd = "conda"
-    requirements = ["pandas"]
+    requirements = ["pandas", "scikit-learn"]
 
     parameters = {
         "debug": [False],
+        "n_splits": [5],
+        "validation_size": [0.2],
     }
 
     def get_data(self):
+        path = config.get_data_path(key="SMAP")
+
         # Check if the data is already here
-        if not os.path.exists("data/SMAP/SMAP_train.npy"):
-            os.makedirs("data/SMAP", exist_ok=True)
+        if not pathlib.Path.exists(path):
 
-            response = requests.get(url_xtrain)
-            with open("data/SMAP/SMAP_train.npy", "wb") as f:
+            response = requests.get(URL_XTRAIN)
+            with open(pathlib.Path(path) / "SMAP_train.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_xtest)
-            with open("data/SMAP/SMAP_test.npy", "wb") as f:
+            response = requests.get(URL_XTEST)
+            with open(pathlib.Path(path) / "SMAP_test.npy", "wb") as f:
                 f.write(response.content)
 
-            response = requests.get(url_ytest)
-            with open("data/SMAP/SMAP_test_label.npy", "wb") as f:
+            response = requests.get(URL_YTEST)
+            with open(pathlib.Path(path) / "SMAP_test_label.npy", "wb") as f:
                 f.write(response.content)
 
-        X_train = np.load("data/SMAP/SMAP_train.npy")
-        X_test = np.load("data/SMAP/SMAP_test.npy")
-        y_test = np.load("data/SMAP/SMAP_test_label.npy")
+        X_train = np.load(path / "SMAP_train.npy")
+        X_test = np.load(path / "SMAP_test.npy")
+        y_test = np.load(path / "SMAP_test_label.npy")
 
         # Limiting the size of the dataset for testing purposes
         if self.debug:
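
datasets/smap.py additionally adds scikit-learn to the conda requirements and two new parameters, n_splits and validation_size; together with the commented-out import above, this suggests time-series cross-validation with sklearn's TimeSeriesSplit is planned. For reference, a minimal sketch of how n_splits could drive it (hypothetical wiring, not part of this commit; how validation_size maps onto the splitter is not specified here):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X_train = np.random.default_rng(0).normal(size=(100, 5))  # stand-in for the SMAP series

tscv = TimeSeriesSplit(n_splits=5)  # would come from the n_splits parameter
for train_idx, val_idx in tscv.split(X_train):
    # each validation fold starts strictly after its training fold,
    # preserving temporal order
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
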
