Skip to content

Commit

Permalink
Merge pull request #105 from milesgranger/milesgranger/fix-104-non-da…
Browse files Browse the repository at this point in the history
…sk-data-compat

Support dask, cupy, and numpy array types
  • Loading branch information
milesgranger authored Nov 6, 2023
2 parents 5194d6f + b51cfe7 commit 7ee6764
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 35 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@ jobs:
- name: Run pytest
shell: bash -l {0}
run: |
pip install pytest
pytest dask_glm
pip install pytest pytest-xdist
pytest dask_glm -n auto
7 changes: 4 additions & 3 deletions dask_glm/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
maybe_to_cupy,
normalize,
scatter_array,
zeros,
)


Expand Down Expand Up @@ -115,7 +116,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
stepSize = 1.0
recalcRate = 10
backtrackMult = firstBacktrackMult
beta = np.zeros_like(X._meta, shape=p)
beta = zeros(shape=p, arr=getattr(X, "_meta", None))

for k in range(max_iter):
# how necessary is this recalculation?
Expand Down Expand Up @@ -188,7 +189,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
"""
gradient, hessian = family.gradient, family.hessian
n, p = X.shape
beta = np.zeros_like(X._meta, shape=p)
beta = zeros(shape=p, arr=getattr(X, "_meta", None))
Xbeta = dot(X, beta)

iter_count = 0
Expand Down Expand Up @@ -457,7 +458,7 @@ def proximal_grad(
stepSize = 1.0
recalcRate = 10
backtrackMult = firstBacktrackMult
beta = np.zeros_like(X._meta, shape=p)
beta = zeros(shape=p, arr=getattr(X, "_meta", None))
regularizer = Regularizer.get(regularizer)

for k in range(max_iter):
Expand Down
66 changes: 37 additions & 29 deletions dask_glm/tests/test_algos_families.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,34 @@ def make_intercept_data(N, p, seed=20009):
return X, y


def make_array_type(type, *arrays):
if type == "cupy":
cupy = pytest.importorskip("cupy")
return to_dask_cupy_array_xy(*arrays, cupy)
elif type == "numpy":
return tuple(a.compute() if hasattr(a, "compute") else a for a in arrays)
else:
return arrays


def maybe_compute(expr):
return expr.compute() if hasattr(expr, "compute") else expr


@pytest.mark.parametrize("opt", [lbfgs, newton, gradient_descent])
@pytest.mark.parametrize(
"N, p, seed,", [(100, 2, 20009), (250, 12, 90210), (95, 6, 70605)]
)
@pytest.mark.parametrize("is_cupy", [True, False])
def test_methods(N, p, seed, opt, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_methods(N, p, seed, opt, array_type):
X, y = make_intercept_data(N, p, seed=seed)

if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

coefs = opt(X, y)
p = sigmoid(X.dot(coefs).compute())
dot = maybe_compute(X.dot(coefs))
p = sigmoid(dot)

y_sum = y.compute().sum()
y_sum = maybe_compute(y).sum()
p_sum = p.sum()
assert np.isclose(y_sum, p_sum, atol=1e-1)

Expand All @@ -68,25 +80,24 @@ def test_methods(N, p, seed, opt, is_cupy):
@pytest.mark.parametrize("N", [1000])
@pytest.mark.parametrize("nchunks", [1, 10])
@pytest.mark.parametrize("family", [Logistic, Normal, Poisson])
@pytest.mark.parametrize("is_cupy", [True, False])
def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_basic_unreg_descent(func, kwargs, N, nchunks, family, array_type):
beta = np.random.normal(size=2)
M = len(beta)
X = da.random.random((N, M), chunks=(N // nchunks, M))
y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))

if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

X, y = persist(X, y)
if array_type != "numpy":
X, y = persist(X, y)

result = func(X, y, family=family, **kwargs)
test_vec = np.random.normal(size=2)
test_vec = maybe_to_cupy(test_vec, X)

opt = family.pointwise_loss(result, X, y).compute()
test_val = family.pointwise_loss(test_vec, X, y).compute()
opt = maybe_compute(family.pointwise_loss(result, X, y))
test_val = maybe_compute(family.pointwise_loss(test_vec, X, y))

assert opt < test_val

Expand All @@ -103,27 +114,26 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy):
@pytest.mark.parametrize("family", [Logistic, Normal, Poisson])
@pytest.mark.parametrize("lam", [0.01, 1.2, 4.05])
@pytest.mark.parametrize("reg", [r() for r in Regularizer.__subclasses__()])
@pytest.mark.parametrize("is_cupy", [True, False])
def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, array_type):
beta = np.random.normal(size=2)
M = len(beta)
X = da.random.random((N, M), chunks=(N // nchunks, M))
y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))

if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

X, y = persist(X, y)
if array_type != "numpy":
X, y = persist(X, y)

result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
test_vec = np.random.normal(size=2)
test_vec = maybe_to_cupy(test_vec, X)

f = reg.add_reg_f(family.pointwise_loss, lam)

opt = f(result, X, y).compute()
test_val = f(test_vec, X, y).compute()
opt = maybe_compute(f(result, X, y))
test_val = maybe_compute(f(test_vec, X, y))

assert opt < test_val

Expand All @@ -138,12 +148,10 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy):
],
)
@pytest.mark.parametrize("scheduler", ["synchronous", "threading", "multiprocessing"])
@pytest.mark.parametrize("is_cupy", [True, False])
def test_determinism(func, kwargs, scheduler, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_determinism(func, kwargs, scheduler, array_type):
X, y = make_intercept_data(1000, 10)
if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

with dask.config.set(scheduler=scheduler):
a = func(X, y, **kwargs)
Expand Down
9 changes: 8 additions & 1 deletion dask_glm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def normalize_inputs(X, y, *args, **kwargs):
mean = (
mean
if len(intercept_idx[0])
else np.zeros_like(X._meta, shape=mean.shape)
else zeros(mean.shape, getattr(X, "_meta", None))
)
Xn = (X - mean) / std
out = algo(Xn, y, *args, **kwargs).copy()
Expand All @@ -39,6 +39,13 @@ def normalize_inputs(X, y, *args, **kwargs):
return normalize_inputs


def zeros(shape, arr=None, **kwargs):
if arr:
return np.zeros_like(arr, shape=shape, **kwargs)
else:
return np.zeros(shape=shape, **kwargs)


def sigmoid(x):
"""Sigmoid function of x."""
return 1 / (1 + exp(-x))
Expand Down

0 comments on commit 7ee6764

Please sign in to comment.