diff --git a/.gitignore b/.gitignore
index ccb60ab..192e38b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ wheels/
 *.egg
 dask_glm/.ropeproject/*
 .ipynb_checkpoints/
+.idea/*
diff --git a/dask_glm/datasets.py b/dask_glm/datasets.py
index 75fbfc4..68c2cb4 100644
--- a/dask_glm/datasets.py
+++ b/dask_glm/datasets.py
@@ -1,5 +1,6 @@
 import numpy as np
 import dask.array as da
+from dask_glm.utils import exp
 
 
 def make_classification(n_samples=1000, n_features=100, n_informative=2, scale=1.0,
@@ -22,3 +23,15 @@ def make_regression(n_samples=1000, n_features=100, n_informative=2, scale=1.0,
     z0 = X[:, informative_idx].dot(beta[informative_idx])
     y = da.random.random(z0.shape, chunks=(chunksize,))
     return X, y
+
+
+def make_poisson(n_samples=1000, n_features=100, n_informative=2, scale=1.0,
+                 chunksize=100):
+    X = da.random.normal(0, 1, size=(n_samples, n_features),
+                         chunks=(chunksize, n_features))
+    informative_idx = np.random.choice(n_features, n_informative)
+    beta = (np.random.random(n_features) - 1) * scale
+    z0 = X[:, informative_idx].dot(beta[informative_idx])
+    rate = exp(z0)
+    y = da.random.poisson(rate, size=1, chunks=(chunksize,))
+    return X, y
diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py
index 56064a5..0dc3730 100644
--- a/dask_glm/estimators.py
+++ b/dask_glm/estimators.py
@@ -6,7 +6,7 @@
 from . import algorithms
 from . import families
 from .utils import (
-    sigmoid, dot, add_intercept, mean_squared_error, accuracy_score
+    sigmoid, dot, add_intercept, mean_squared_error, accuracy_score, exp, poisson_deviance
 )
 
 
@@ -120,8 +120,9 @@ def score(self, X, y):
 
 class LinearRegression(_GLM):
     """
-    Ordinary Lest Square regression
+    Ordinary Least Squares regression
     """
+    @property
     def family(self):
         return families.Normal
 
@@ -132,3 +133,52 @@ def predict(self, X):
 
     def score(self, X, y):
         return mean_squared_error(y, self.predict(X))
+
+
+class PoissonRegression(_GLM):
+    """
+    Parameters
+    ----------
+    fit_intercept : bool, default True
+        Specifies if a constant (a.k.a. bias or intercept) should be
+        added to the decision function.
+    solver : {'admm', 'gradient_descent', 'newton', 'bfgs', 'proximal_grad'}
+        Solver to use. See :ref:`algorithms` for details
+    regularizer : {'l1', 'l2'}
+        Regularizer to use. See :ref:`regularizers` for details.
+        Only used with ``admm`` and ``proximal_grad`` solvers.
+    max_iter : int, default 100
+        Maximum number of iterations taken for the solvers to converge
+    tol : float, default 1e-4
+        Tolerance for stopping criteria. Ignored for ``admm`` solver
+    lambduh : float, default 1.0
+        Only used with ``admm`` and ``proximal_grad`` solvers
+    rho, over_relax, abstol, reltol : float
+        Only used with the ``admm`` solver. See :ref:`algorithms.admm`
+        for details
+
+    Attributes
+    ----------
+    coef_ : array, shape (n_classes, n_features)
+    intercept_ : float
+
+    Examples
+    --------
+    >>> from dask_glm.datasets import make_poisson
+    >>> X, y = make_poisson()
+    >>> pr = PoissonRegression()
+    >>> pr.fit(X, y)
+    >>> pr.predict(X)
+    >>> pr.get_deviance(X, y)
+    """
+
+    @property
+    def family(self):
+        return families.Poisson
+
+    def predict(self, X):
+        X_ = self._maybe_add_intercept(X)
+        return exp(dot(X_, self._coef))
+
+    def get_deviance(self, X, y):
+        return poisson_deviance(y, self.predict(X))
diff --git a/dask_glm/families.py b/dask_glm/families.py
index 743b3f4..119351d 100644
--- a/dask_glm/families.py
+++ b/dask_glm/families.py
@@ -4,7 +4,6 @@
 
 
 class Logistic(object):
-
     @staticmethod
     def loglike(Xbeta, y):
         eXbeta = exp(Xbeta)
@@ -38,7 +37,7 @@ def hessian(Xbeta, X):
 class Normal(object):
     @staticmethod
     def loglike(Xbeta, y):
-        return ((y - Xbeta)**2).sum()
+        return ((y - Xbeta) ** 2).sum()
 
     @staticmethod
     def pointwise_loss(beta, X, y):
@@ -59,3 +58,39 @@ def gradient(Xbeta, X, y):
     @staticmethod
     def hessian(Xbeta, X):
         return 2 * dot(X.T, X)
+
+
+class Poisson(object):
+    """
+    This implements Poisson regression for count data.
+    See https://en.wikipedia.org/wiki/Poisson_regression.
+    """
+
+    @staticmethod
+    def loglike(Xbeta, y):
+        eXbeta = exp(Xbeta)
+        yXbeta = y * Xbeta
+        return (eXbeta - yXbeta).sum()
+
+    @staticmethod
+    def pointwise_loss(beta, X, y):
+        beta, y = beta.ravel(), y.ravel()
+        Xbeta = X.dot(beta)
+        return Poisson.loglike(Xbeta, y)
+
+    @staticmethod
+    def pointwise_gradient(beta, X, y):
+        beta, y = beta.ravel(), y.ravel()
+        Xbeta = X.dot(beta)
+        return Poisson.gradient(Xbeta, X, y)
+
+    @staticmethod
+    def gradient(Xbeta, X, y):
+        eXbeta = exp(Xbeta)
+        return dot(X.T, eXbeta - y)
+
+    @staticmethod
+    def hessian(Xbeta, X):
+        eXbeta = exp(Xbeta)
+        x_diag_eXbeta = eXbeta[:, None] * X
+        return dot(X.T, x_diag_eXbeta)
diff --git a/dask_glm/tests/test_algos_families.py b/dask_glm/tests/test_algos_families.py
index 5c75f83..44b244f 100644
--- a/dask_glm/tests/test_algos_families.py
+++ b/dask_glm/tests/test_algos_families.py
@@ -8,7 +8,7 @@
 
 from dask_glm.algorithms import (newton, bfgs, proximal_grad,
                                  gradient_descent, admm)
-from dask_glm.families import Logistic, Normal
+from dask_glm.families import Logistic, Normal, Poisson
 from dask_glm.regularizers import L1, L2
 from dask_glm.utils import sigmoid, make_y
 
@@ -63,7 +63,7 @@ def test_methods(N, p, seed, opt):
 ])
 @pytest.mark.parametrize('N', [1000])
 @pytest.mark.parametrize('nchunks', [1, 10])
-@pytest.mark.parametrize('family', [Logistic, Normal])
+@pytest.mark.parametrize('family', [Logistic, Normal, Poisson])
 def test_basic_unreg_descent(func, kwargs, N, nchunks, family):
     beta = np.random.normal(size=2)
     M = len(beta)
@@ -87,7 +87,7 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family):
 ])
 @pytest.mark.parametrize('N', [1000])
 @pytest.mark.parametrize('nchunks', [1, 10])
-@pytest.mark.parametrize('family', [Logistic, Normal])
+@pytest.mark.parametrize('family', [Logistic, Normal, Poisson])
 @pytest.mark.parametrize('lam', [0.01, 1.2, 4.05])
 @pytest.mark.parametrize('reg', [L1, L2])
 def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg):
diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py
index 5e0de11..a0b75d2 100644
--- a/dask_glm/tests/test_estimators.py
+++ b/dask_glm/tests/test_estimators.py
@@ -1,7 +1,7 @@
 import pytest
 
-from dask_glm.estimators import LogisticRegression, LinearRegression
-from dask_glm.datasets import make_classification, make_regression
+from dask_glm.estimators import LogisticRegression, LinearRegression, PoissonRegression
+from dask_glm.datasets import make_classification, make_regression, make_poisson
 from dask_glm.algorithms import _solvers
 from dask_glm.regularizers import _regularizers
 
@@ -39,6 +39,10 @@ def test_lr_init(solver):
     LogisticRegression(solver=solver)
 
 
+def test_pr_init(solver):
+    PoissonRegression(solver=solver)
+
+
 @pytest.mark.parametrize('fit_intercept', [True, False])
 def test_fit(fit_intercept):
     X, y = make_classification(n_samples=100, n_features=5, chunksize=10)
@@ -71,6 +75,19 @@ def test_big(fit_intercept):
         assert lr.intercept_ is not None
 
 
+@pytest.mark.parametrize('fit_intercept', [True, False])
+def test_poisson_fit(fit_intercept):
+    import dask
+    dask.set_options(get=dask.get)
+    X, y = make_poisson()
+    pr = PoissonRegression(fit_intercept=fit_intercept)
+    pr.fit(X, y)
+    pr.predict(X)
+    pr.get_deviance(X, y)
+    if fit_intercept:
+        assert pr.intercept_ is not None
+
+
 def test_in_pipeline():
     from sklearn.pipeline import make_pipeline
     X, y = make_classification(n_samples=100, n_features=5, chunksize=10)
diff --git a/dask_glm/utils.py b/dask_glm/utils.py
index e3a9919..a08c41f 100644
--- a/dask_glm/utils.py
+++ b/dask_glm/utils.py
@@ -145,6 +145,10 @@ def accuracy_score(y_true, y_pred):
     return (y_true == y_pred).mean()
 
 
+def poisson_deviance(y_true, y_pred):
+    return 2 * (y_true * log1p(y_true / y_pred) - (y_true - y_pred)).sum()
+
+
 try:
     import sparse
 except ImportError:
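
Usage sketch (not part of the patch above): the new estimator can be exercised end to end along the lines of the docstring example added in estimators.py, assuming only the ``fit_intercept``, ``solver``, and ``max_iter`` keyword arguments that the new PoissonRegression docstring documents:

    >>> from dask_glm.datasets import make_poisson
    >>> from dask_glm.estimators import PoissonRegression
    >>> X, y = make_poisson(n_samples=1000, chunksize=100)
    >>> pr = PoissonRegression(fit_intercept=True, solver='admm', max_iter=100)
    >>> pr.fit(X, y)
    >>> rates = pr.predict(X)        # exponentiated linear predictor, per the new predict()
    >>> dev = pr.get_deviance(X, y)  # deviance computed via utils.poisson_deviance

Because ``predict`` exponentiates the linear predictor, its output is on the scale of the observed counts rather than the link scale, which is why ``get_deviance`` can compare it directly against ``y``.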