From b2b6f108d9a666419601addab9526c162ae038c1 Mon Sep 17 00:00:00 2001 From: postelrich Date: Thu, 4 May 2017 18:01:26 -0400 Subject: [PATCH] Elastic Net Regularizer (#49) * create regularizer base class, elastic net regularization, update readme with dev setup. * vectorize proximal operator for elastic net * fix l2, write tests for regularizers * add tests for elastic net * fix flake, change to string * use base regularizer class to retrieve subclasses via string. * add tests for get * fix docstrings, add hessian for l1. --- README.rst | 14 ++ dask_glm/algorithms.py | 10 +- dask_glm/regularizers.py | 152 +++++++++------ dask_glm/tests/test_admm.py | 2 +- dask_glm/tests/test_algos_families.py | 4 +- dask_glm/tests/test_estimators.py | 7 +- dask_glm/tests/test_regularizers.py | 181 ++++++++++++++++++ ...ElasticNetProximalOperatorDerivation.ipynb | 94 +++++++++ 8 files changed, 396 insertions(+), 68 deletions(-) create mode 100644 dask_glm/tests/test_regularizers.py create mode 100644 notebooks/ElasticNetProximalOperatorDerivation.ipynb diff --git a/README.rst b/README.rst index 9eeddd7..2b4b61d 100644 --- a/README.rst +++ b/README.rst @@ -5,5 +5,19 @@ Generalized Linear Models in Dask *This library is not ready for use.* +Developer Setup +--------------- +Setup environment (from repo directory):: + + conda create env + source activate dask_glm + pip install -e . + +Run tests:: + + py.test + + + .. |Build Status| image:: https://travis-ci.org/dask/dask-glm.svg?branch=master :target: https://travis-ci.org/dask/dask-glm diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 981bdae..e7bfb4d 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -24,7 +24,7 @@ from dask_glm.utils import dot, exp, log1p from dask_glm.families import Logistic -from dask_glm.regularizers import L1, _regularizers +from dask_glm.regularizers import Regularizer def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val, @@ -150,12 +150,12 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic): return beta -def admm(X, y, regularizer=L1, lamduh=0.1, rho=1, over_relax=1, +def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1, max_iter=250, abstol=1e-4, reltol=1e-2, family=Logistic): pointwise_loss = family.pointwise_loss pointwise_gradient = family.pointwise_gradient - regularizer = _regularizers.get(regularizer, regularizer) # string + regularizer = Regularizer.get(regularizer) def create_local_gradient(func): @functools.wraps(func) @@ -319,7 +319,7 @@ def bfgs(X, y, max_iter=500, tol=1e-14, family=Logistic): return beta -def proximal_grad(X, y, regularizer=L1, lamduh=0.1, family=Logistic, +def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic, max_iter=100, tol=1e-8, verbose=False): n, p = X.shape @@ -331,7 +331,7 @@ def proximal_grad(X, y, regularizer=L1, lamduh=0.1, family=Logistic, recalcRate = 10 backtrackMult = firstBacktrackMult beta = np.zeros(p) - regularizer = _regularizers.get(regularizer, regularizer) # string + regularizer = Regularizer.get(regularizer) if verbose: print('# -f |df/f| |dx/x| step') diff --git a/dask_glm/regularizers.py b/dask_glm/regularizers.py index 1398e6b..4587f3d 100644 --- a/dask_glm/regularizers.py +++ b/dask_glm/regularizers.py @@ -3,85 +3,125 @@ import numpy as np -class L2(object): +class Regularizer(object): + """Abstract base class for regularization object. - @staticmethod - def proximal_operator(beta, t): - return 1 / (1 + t) * beta + Defines the set of methods required to create a new regularization object. This includes + the regularization functions itself and its gradient, hessian, and proximal operator. + """ + name = '_base' - @staticmethod - def hessian(beta): - return 2 * np.eye(len(beta)) + def f(self, beta): + """Regularization function.""" + raise NotImplementedError - @staticmethod - def add_reg_hessian(hess, lam): - def wrapped(beta, *args): - return hess(beta, *args) + lam * L2.hessian(beta) - return wrapped + def gradient(self, beta): + """Gradient of regularization function.""" + raise NotImplementedError + + def hessian(self, beta): + """Hessian of regularization function.""" + raise NotImplementedError - @staticmethod - def f(beta): - return (beta**2).sum() + def proximal_operator(self, beta, t): + """Proximal operator for regularization function.""" + raise NotImplementedError - @staticmethod - def add_reg_f(f, lam): + def add_reg_f(self, f, lam): + """Add regularization function to other function.""" def wrapped(beta, *args): - return f(beta, *args) + lam * L2.f(beta) + return f(beta, *args) + lam * self.f(beta) return wrapped - @staticmethod - def gradient(beta): - return 2 * beta + def add_reg_grad(self, grad, lam): + """Add regularization gradient to other gradient function.""" + def wrapped(beta, *args): + return grad(beta, *args) + lam * self.gradient(beta) + return wrapped - @staticmethod - def add_reg_grad(grad, lam): + def add_reg_hessian(self, hess, lam): + """Add regularization hessian to other hessian function.""" def wrapped(beta, *args): - return grad(beta, *args) + lam * L2.gradient(beta) + return hess(beta, *args) + lam * self.hessian(beta) return wrapped + @classmethod + def get(cls, obj): + if isinstance(obj, cls): + return obj + elif isinstance(obj, str): + return {o.name: o for o in cls.__subclasses__()}[obj]() + raise TypeError('Not a valid regularizer object.') -class L1(object): - @staticmethod - def proximal_operator(beta, t): - z = np.maximum(0, beta - t) - np.maximum(0, -beta - t) - return z +class L2(Regularizer): + """L2 regularization.""" + name = 'l2' - @staticmethod - def hessian(beta): - raise ValueError('l1 norm is not twice differentiable!') + def f(self, beta): + return (beta**2).sum() / 2 - @staticmethod - def add_reg_hessian(hess, lam): - def wrapped(beta, *args): - return hess(beta, *args) + lam * L1.hessian(beta) - return wrapped + def gradient(self, beta): + return beta - @staticmethod - def f(beta): - return (np.abs(beta)).sum() + def hessian(self, beta): + return np.eye(len(beta)) - @staticmethod - def add_reg_f(f, lam): - def wrapped(beta, *args): - return f(beta, *args) + lam * L1.f(beta) - return wrapped + def proximal_operator(self, beta, t): + return 1 / (1 + t) * beta + + +class L1(Regularizer): + """L1 regularization.""" + name = 'l1' - @staticmethod - def gradient(beta): + def f(self, beta): + return (np.abs(beta)).sum() + + def gradient(self, beta): if np.any(np.isclose(beta, 0)): raise ValueError('l1 norm is not differentiable at 0!') else: return np.sign(beta) - @staticmethod - def add_reg_grad(grad, lam): - def wrapped(beta, *args): - return grad(beta, *args) + lam * L1.gradient(beta) - return wrapped + def hessian(self, beta): + if np.any(np.isclose(beta, 0)): + raise ValueError('l1 norm is not twice differentiable at 0!') + return np.zeros((beta.shape[0], beta.shape[0])) + + def proximal_operator(self, beta, t): + z = np.maximum(0, beta - t) - np.maximum(0, -beta - t) + return z + + +class ElasticNet(Regularizer): + """Elastic net regularization.""" + name = 'elastic_net' + + def __init__(self, weight=0.5): + self.weight = weight + self.l1 = L1() + self.l2 = L2() + + def _weighted(self, left, right): + return self.weight * left + (1 - self.weight) * right + + def f(self, beta): + return self._weighted(self.l1.f(beta), self.l2.f(beta)) + + def gradient(self, beta): + return self._weighted(self.l1.gradient(beta), self.l2.gradient(beta)) + + def hessian(self, beta): + return self._weighted(self.l1.hessian(beta), self.l2.hessian(beta)) + def proximal_operator(self, beta, t): + """See notebooks/ElasticNetProximalOperatorDerivation.ipynb for derivation.""" + g = self.weight * t -_regularizers = { - 'l1': L1, - 'l2': L2, -} + @np.vectorize + def func(b): + if b <= g: + return 0 + return (b - g * np.sign(b)) / (t - g + 1) + return beta diff --git a/dask_glm/tests/test_admm.py b/dask_glm/tests/test_admm.py index 32a7a2e..1d97cde 100644 --- a/dask_glm/tests/test_admm.py +++ b/dask_glm/tests/test_admm.py @@ -52,6 +52,6 @@ def test_admm_with_large_lamduh(N, p, nchunks): y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,)) X, y = persist(X, y) - z = admm(X, y, regularizer=L1, lamduh=1e4, rho=20, max_iter=500) + z = admm(X, y, regularizer=L1(), lamduh=1e4, rho=20, max_iter=500) assert np.allclose(z, np.zeros(p), atol=1e-4) diff --git a/dask_glm/tests/test_algos_families.py b/dask_glm/tests/test_algos_families.py index 44b244f..773f0c8 100644 --- a/dask_glm/tests/test_algos_families.py +++ b/dask_glm/tests/test_algos_families.py @@ -9,7 +9,7 @@ from dask_glm.algorithms import (newton, bfgs, proximal_grad, gradient_descent, admm) from dask_glm.families import Logistic, Normal, Poisson -from dask_glm.regularizers import L1, L2 +from dask_glm.regularizers import Regularizer from dask_glm.utils import sigmoid, make_y @@ -89,7 +89,7 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family): @pytest.mark.parametrize('nchunks', [1, 10]) @pytest.mark.parametrize('family', [Logistic, Normal, Poisson]) @pytest.mark.parametrize('lam', [0.01, 1.2, 4.05]) -@pytest.mark.parametrize('reg', [L1, L2]) +@pytest.mark.parametrize('reg', [r() for r in Regularizer.__subclasses__()]) def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg): beta = np.random.normal(size=2) M = len(beta) diff --git a/dask_glm/tests/test_estimators.py b/dask_glm/tests/test_estimators.py index a0b75d2..fc913f5 100644 --- a/dask_glm/tests/test_estimators.py +++ b/dask_glm/tests/test_estimators.py @@ -2,17 +2,16 @@ from dask_glm.estimators import LogisticRegression, LinearRegression, PoissonRegression from dask_glm.datasets import make_classification, make_regression, make_poisson -from dask_glm.algorithms import _solvers -from dask_glm.regularizers import _regularizers +from dask_glm.regularizers import Regularizer -@pytest.fixture(params=_solvers.keys()) +@pytest.fixture(params=[r() for r in Regularizer.__subclasses__()]) def solver(request): """Parametrized fixture for all the solver names""" return request.param -@pytest.fixture(params=_regularizers.keys()) +@pytest.fixture(params=[r() for r in Regularizer.__subclasses__()]) def regularizer(request): """Parametrized fixture for all the regularizer names""" return request.param diff --git a/dask_glm/tests/test_regularizers.py b/dask_glm/tests/test_regularizers.py new file mode 100644 index 0000000..6c875af --- /dev/null +++ b/dask_glm/tests/test_regularizers.py @@ -0,0 +1,181 @@ +import numpy as np +import numpy.testing as npt +import pytest +from dask_glm import regularizers as regs + + +@pytest.mark.parametrize('func,args', [ + ('f', [0]), + ('gradient', [0]), + ('hessian', [0]), + ('proximal_operator', [0, 1]) +]) +def test_base_class_raises_notimplementederror(func, args): + with pytest.raises(NotImplementedError): + getattr(regs.Regularizer(), func)(*args) + + +class FooRegularizer(regs.Regularizer): + + def f(self, beta): + return beta + 1 + + def gradient(self, beta): + return beta + 1 + + def hessian(self, beta): + return beta + 1 + + +@pytest.mark.parametrize('func', [ + 'add_reg_f', + 'add_reg_grad', + 'add_reg_hessian' +]) +def test_add_reg_funcs(func): + def foo(x): + return x**2 + new_func = getattr(FooRegularizer(), func)(foo, 1) + assert callable(new_func) + assert new_func(2) == 7 + + +def test_regularizer_get_passes_through_instance(): + x = FooRegularizer() + assert regs.Regularizer.get(x) == x + + +def test_regularizer_get_unnamed_raises(): + with pytest.raises(KeyError): + regs.Regularizer.get('foo') + + +def test_regularizer_gets_from_name(): + class Foo(regs.Regularizer): + name = 'foo' + assert isinstance(regs.Regularizer.get('foo'), Foo) + + +@pytest.mark.parametrize('beta,expected', [ + (np.array([0, 0, 0]), 0), + (np.array([1, 2, 3]), 7) +]) +def test_l2_function(beta, expected): + assert regs.L2().f(beta) == expected + + +@pytest.mark.parametrize('beta', [ + np.array([0, 0, 0]), + np.array([1, 2, 3]) +]) +def test_l2_gradient(beta): + npt.assert_array_equal(regs.L2().gradient(beta), beta) + + +@pytest.mark.parametrize('beta', [ + np.array([0, 0, 0]), + np.array([1, 2, 3]) +]) +def test_l2_hessian(beta): + npt.assert_array_equal(regs.L2().hessian(beta), np.eye(len(beta))) + + +@pytest.mark.parametrize('beta,expected', [ + (np.array([0, 0, 0]), np.array([0, 0, 0])), + (np.array([1, 2, 3]), np.array([0.5, 1, 1.5])) +]) +def test_l2_proximal_operator(beta, expected): + npt.assert_array_equal(regs.L2().proximal_operator(beta, 1), expected) + + +@pytest.mark.parametrize('beta,expected', [ + (np.array([0, 0, 0]), 0), + (np.array([-1, 2, 3]), 6) +]) +def test_l1_function(beta, expected): + assert regs.L1().f(beta) == expected + + +@pytest.mark.parametrize('beta,expected', [ + (np.array([1, 2, 3]), np.array([1, 1, 1])), + (np.array([-1, 2, 3]), np.array([-1, 1, 1])) +]) +def test_l1_gradient(beta, expected): + npt.assert_array_equal(regs.L1().gradient(beta), expected) + + +@pytest.mark.parametrize('beta', [ + np.array([0.00000001, 1, 2]), + np.array([-0.00000001, 1, 2]), + np.array([0, 0, 0]) +]) +def test_l1_gradient_raises_near_zero(beta): + with pytest.raises(ValueError): + regs.L1().gradient(beta) + + +def test_l1_hessian(): + npt.assert_array_equal(regs.L1().hessian(np.array([1, 2])), + np.array([[0, 0], [0, 0]])) + + +def test_l1_hessian_raises(): + with pytest.raises(ValueError): + regs.L1().hessian(np.array([0, 0, 0])) + + +@pytest.mark.parametrize('beta,expected', [ + (np.array([0, 0, 0]), np.array([0, 0, 0])), + (np.array([1, 2, 3]), np.array([0, 1, 2])) +]) +def test_l1_proximal_operator(beta, expected): + npt.assert_array_equal(regs.L1().proximal_operator(beta, 1), expected) + + +@pytest.mark.parametrize('beta,expected', [ + (np.array([0, 0, 0]), 0), + (np.array([1, 2, 3]), 6.5) +]) +def test_elastic_net_function(beta, expected): + assert regs.ElasticNet().f(beta) == expected + + +def test_elastic_net_function_zero_weight_is_l2(): + beta = np.array([1, 2, 3]) + assert regs.ElasticNet(weight=0).f(beta) == regs.L2().f(beta) + + +def test_elastic_net_function_zero_weight_is_l1(): + beta = np.array([1, 2, 3]) + assert regs.ElasticNet(weight=1).f(beta) == regs.L1().f(beta) + + +def test_elastic_net_gradient(): + beta = np.array([1, 2, 3]) + npt.assert_array_equal(regs.ElasticNet(weight=0.5).gradient(beta), np.array([1, 1.5, 2])) + + +def test_elastic_net_gradient_zero_weight_is_l2(): + beta = np.array([1, 2, 3]) + npt.assert_array_equal(regs.ElasticNet(weight=0).gradient(beta), regs.L2().gradient(beta)) + + +def test_elastic_net_gradient_zero_weight_is_l1(): + beta = np.array([1, 2, 3]) + npt.assert_array_equal(regs.ElasticNet(weight=1).gradient(beta), regs.L1().gradient(beta)) + + +def test_elastic_net_hessian(): + beta = np.array([1, 2, 3]) + npt.assert_array_equal(regs.ElasticNet(weight=0.5).hessian(beta), + np.eye(len(beta)) * regs.ElasticNet().weight) + + +def test_elastic_net_hessian_raises(): + with pytest.raises(ValueError): + regs.ElasticNet(weight=0.5).hessian(np.array([0, 1, 2])) + + +def test_elastic_net_proximal_operator(): + beta = np.array([1, 2, 3]) + npt.assert_array_equal(regs.ElasticNet(weight=0.5).proximal_operator(beta, 1), beta) diff --git a/notebooks/ElasticNetProximalOperatorDerivation.ipynb b/notebooks/ElasticNetProximalOperatorDerivation.ipynb new file mode 100644 index 0000000..20387ca --- /dev/null +++ b/notebooks/ElasticNetProximalOperatorDerivation.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Title: Proximal Operator for Elastic Net Regularization \n", + "Author: Christopher White, Richard Postelnik \n", + "Date: May 3, 2017 \n", + "\n", + "# Derivation of the Proximal Operator for Elastic Net Regularization\n", + "\n", + "The proximal operator for a function $f$ is defined as:\n", + "\n", + "$$prox_f(v, \\lambda)= \\arg\\min_x \\big(f(x) + \\frac{1}{2\\lambda}\\| x-v\\|^2\\big)$$\n", + "\n", + "Elastic net regularization is defined as a convex combination of the $\\ell_1$ and $\\ell_2$ norm:\n", + "\n", + "$$\\alpha\\| x\\|_1 + \\frac{(1 - \\alpha)}{2}\\| x\\|^2_2$$\n", + "\n", + "where $\\alpha\\in[0, 1]$ and the half before the $\\ell_2$ norm is added for purely for convenience in derivation.\n", + "\n", + "Plugging this into the proximal operator definition:\n", + "\n", + "$$prox_f(v, \\lambda)=\\arg\\min_x\\big(\\alpha\\| x\\|_1 + \\frac{(1 - \\alpha)}{2}\\| x\\|^2_2 + \\frac{1}{2\\lambda}\\| x-v\\|^2\\big)$$\n", + "\n", + "The first order optimality condition states that $0$ is in the subgradient:\n", + "\n", + "$$\\alpha\\partial\\Vert x\\Vert_1 + (1-\\alpha)x_i + \\frac{1}{\\lambda}(x_i - v_i) \\ni 0$$\n", + "\n", + "where:\n", + "\n", + "$$\\alpha\\partial\\Vert x\\Vert_1 = \\left\\{\n", + "\\begin{array}{ll}\n", + " sign(x)\\alpha & x \\neq 0 \\\\\n", + " [-\\alpha, \\alpha] & x=0 \\\\\n", + "\\end{array} \n", + "\\right.$$\n", + "\n", + "When x is not 0 we have:\n", + "\n", + "$$x = \\frac{v-\\lambda sign(v)}{\\lambda - \\lambda\\alpha + 1}$$\n", + "\n", + "Plugging in values for positive and negative x we find the above holds when:\n", + "\n", + "$$|v| > \\lambda\\alpha$$\n", + "\n", + "Likewise, when x = 0 the condition is:\n", + "\n", + "$$|v| \\leq \\lambda\\alpha$$\n", + "\n", + "And we find that the proximal operator for elastic net is:\n", + "\n", + "$$prox_f(v, \\lambda) = \\left\\{\n", + "\\begin{array}{ll}\n", + " \\frac{v-\\lambda sign(v)}{\\lambda - \\lambda\\alpha + 1} & |v| > \\lambda\\alpha \\\\\n", + " 0 & |v|\\leq \\lambda\\alpha \\\\\n", + "\\end{array} \n", + "\\right.$$\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}