From b51cfe76db40c5267e1cd0d5e339ccc961cb9c20 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Wed, 1 Nov 2023 09:02:25 +0100 Subject: [PATCH] Support dask, cupy, and numpy array types --- .github/workflows/ci.yml | 4 +- dask_glm/algorithms.py | 7 +-- dask_glm/tests/test_algos_families.py | 66 +++++++++++++++------------ dask_glm/utils.py | 9 +++- 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20d0b75..852d6c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,5 +31,5 @@ jobs: - name: Run pytest shell: bash -l {0} run: | - pip install pytest - pytest dask_glm + pip install pytest pytest-xdist + pytest dask_glm -n auto diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 8a6d15d..21e02f0 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -20,6 +20,7 @@ maybe_to_cupy, normalize, scatter_array, + zeros, ) @@ -115,7 +116,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs): stepSize = 1.0 recalcRate = 10 backtrackMult = firstBacktrackMult - beta = np.zeros_like(X._meta, shape=p) + beta = zeros(shape=p, arr=getattr(X, "_meta", None)) for k in range(max_iter): # how necessary is this recalculation? @@ -188,7 +189,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs): """ gradient, hessian = family.gradient, family.hessian n, p = X.shape - beta = np.zeros_like(X._meta, shape=p) + beta = zeros(shape=p, arr=getattr(X, "_meta", None)) Xbeta = dot(X, beta) iter_count = 0 @@ -457,7 +458,7 @@ def proximal_grad( stepSize = 1.0 recalcRate = 10 backtrackMult = firstBacktrackMult - beta = np.zeros_like(X._meta, shape=p) + beta = zeros(shape=p, arr=getattr(X, "_meta", None)) regularizer = Regularizer.get(regularizer) for k in range(max_iter): diff --git a/dask_glm/tests/test_algos_families.py b/dask_glm/tests/test_algos_families.py index 5d14f53..672f136 100644 --- a/dask_glm/tests/test_algos_families.py +++ b/dask_glm/tests/test_algos_families.py @@ -37,22 +37,34 @@ def make_intercept_data(N, p, seed=20009): return X, y +def make_array_type(type, *arrays): + if type == "cupy": + cupy = pytest.importorskip("cupy") + return to_dask_cupy_array_xy(*arrays, cupy) + elif type == "numpy": + return tuple(a.compute() if hasattr(a, "compute") else a for a in arrays) + else: + return arrays + + +def maybe_compute(expr): + return expr.compute() if hasattr(expr, "compute") else expr + + @pytest.mark.parametrize("opt", [lbfgs, newton, gradient_descent]) @pytest.mark.parametrize( "N, p, seed,", [(100, 2, 20009), (250, 12, 90210), (95, 6, 70605)] ) -@pytest.mark.parametrize("is_cupy", [True, False]) -def test_methods(N, p, seed, opt, is_cupy): +@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"]) +def test_methods(N, p, seed, opt, array_type): X, y = make_intercept_data(N, p, seed=seed) - - if is_cupy: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) + X, y = make_array_type(array_type, X, y) coefs = opt(X, y) - p = sigmoid(X.dot(coefs).compute()) + dot = maybe_compute(X.dot(coefs)) + p = sigmoid(dot) - y_sum = y.compute().sum() + y_sum = maybe_compute(y).sum() p_sum = p.sum() assert np.isclose(y_sum, p_sum, atol=1e-1) @@ -68,25 +80,24 @@ def test_methods(N, p, seed, opt, is_cupy): @pytest.mark.parametrize("N", [1000]) @pytest.mark.parametrize("nchunks", [1, 10]) @pytest.mark.parametrize("family", [Logistic, Normal, Poisson]) -@pytest.mark.parametrize("is_cupy", [True, False]) -def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy): +@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"]) +def test_basic_unreg_descent(func, kwargs, N, nchunks, family, array_type): beta = np.random.normal(size=2) M = len(beta) X = da.random.random((N, M), chunks=(N // nchunks, M)) y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,)) - if is_cupy: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) + X, y = make_array_type(array_type, X, y) - X, y = persist(X, y) + if array_type != "numpy": + X, y = persist(X, y) result = func(X, y, family=family, **kwargs) test_vec = np.random.normal(size=2) test_vec = maybe_to_cupy(test_vec, X) - opt = family.pointwise_loss(result, X, y).compute() - test_val = family.pointwise_loss(test_vec, X, y).compute() + opt = maybe_compute(family.pointwise_loss(result, X, y)) + test_val = maybe_compute(family.pointwise_loss(test_vec, X, y)) assert opt < test_val @@ -103,18 +114,17 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy): @pytest.mark.parametrize("family", [Logistic, Normal, Poisson]) @pytest.mark.parametrize("lam", [0.01, 1.2, 4.05]) @pytest.mark.parametrize("reg", [r() for r in Regularizer.__subclasses__()]) -@pytest.mark.parametrize("is_cupy", [True, False]) -def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy): +@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"]) +def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, array_type): beta = np.random.normal(size=2) M = len(beta) X = da.random.random((N, M), chunks=(N // nchunks, M)) y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,)) - if is_cupy: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) + X, y = make_array_type(array_type, X, y) - X, y = persist(X, y) + if array_type != "numpy": + X, y = persist(X, y) result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs) test_vec = np.random.normal(size=2) @@ -122,8 +132,8 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy): f = reg.add_reg_f(family.pointwise_loss, lam) - opt = f(result, X, y).compute() - test_val = f(test_vec, X, y).compute() + opt = maybe_compute(f(result, X, y)) + test_val = maybe_compute(f(test_vec, X, y)) assert opt < test_val @@ -138,12 +148,10 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy): ], ) @pytest.mark.parametrize("scheduler", ["synchronous", "threading", "multiprocessing"]) -@pytest.mark.parametrize("is_cupy", [True, False]) -def test_determinism(func, kwargs, scheduler, is_cupy): +@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"]) +def test_determinism(func, kwargs, scheduler, array_type): X, y = make_intercept_data(1000, 10) - if is_cupy: - cupy = pytest.importorskip("cupy") - X, y = to_dask_cupy_array_xy(X, y, cupy) + X, y = make_array_type(array_type, X, y) with dask.config.set(scheduler=scheduler): a = func(X, y, **kwargs) diff --git a/dask_glm/utils.py b/dask_glm/utils.py index 1bdd8f7..88a4a14 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -26,7 +26,7 @@ def normalize_inputs(X, y, *args, **kwargs): mean = ( mean if len(intercept_idx[0]) - else np.zeros_like(X._meta, shape=mean.shape) + else zeros(mean.shape, getattr(X, "_meta", None)) ) Xn = (X - mean) / std out = algo(Xn, y, *args, **kwargs).copy() @@ -39,6 +39,13 @@ def normalize_inputs(X, y, *args, **kwargs): return normalize_inputs +def zeros(shape, arr=None, **kwargs): + if arr: + return np.zeros_like(arr, shape=shape, **kwargs) + else: + return np.zeros(shape=shape, **kwargs) + + def sigmoid(x): """Sigmoid function of x.""" return 1 / (1 + exp(-x))