Opacus-fusion is an extension of PyTorch Opacus. It enables fast and efficient DP-SGD training via example-wise weight gradient computation and adaptive clipping.
The CUDA Toolkit, cuDNN, and cuBLAS must be installed.
- Set environment variables
export ENV_NAME={ENV_NAME}
export OPACUS_FUSION_PATH={OPACUS_FUSION_PATH} # Absolute path
export CUTLASS_PATH={CUTLASS_PATH} # Absolute path
- Create conda environment
conda create -n $ENV_NAME python=3.9
conda activate $ENV_NAME
- Install torch from https://pytorch.org/get-started/locally/
- Download opacus-fusion from https://github.com/parkbeomsik/opacus-fusion
git clone https://github.com/parkbeomsik/opacus-fusion.git $OPACUS_FUSION_PATH
- Download cutlass from https://github.com/parkbeomsik/cutlass
git clone https://github.com/parkbeomsik/cutlass.git $CUTLASS_PATH
- Install cutlass_wgrad_grouped (this creates `lib` and `include` in the `build` directory)
cd $OPACUS_FUSION_PATH
cd cutlass_wgrad_grouped
mkdir build && cd build
cmake .. -DCUTLASS_PATH=$CUTLASS_PATH
make install
- Install grad_example_module
cd $OPACUS_FUSION_PATH
cd grad_example_module
python setup.py install
- Install custom_rnn
cd $OPACUS_FUSION_PATH
cd custom_rnn
python setup.py install
- Install opacus-fusion
cd $OPACUS_FUSION_PATH
pip install -e .
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
cd $OPACUS_FUSION_PATH/examples
python benchmark_scripts/profile_time_all.py
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
cd $OPACUS_FUSION_PATH/examples
python benchmark.py --input_size 32 --model_type cnn --architecture resnet18 --dpsgd_mode naive --batch_size 16 --profile_time # DPSGD
python benchmark.py --input_size 32 --model_type cnn --architecture resnet18 --dpsgd_mode reweight --batch_size 16 --profile_time # DPSGD(R)
python benchmark.py --input_size 32 --model_type cnn --architecture resnet18 --dpsgd_mode elegant --batch_size 16 --profile_time # Proposed
cd $OPACUS_FUSION_PATH/examples
python benchmark.py --input_size 32 --model_type cnn --architecture resnet18 --dpsgd_mode naive --batch_size 16 --profile_memory --warm_up_steps 0 --steps 1 # DPSGD
python benchmark.py --input_size 32 --model_type cnn --architecture resnet18 --dpsgd_mode reweight --batch_size 16 --profile_memory --warm_up_steps 0 --steps 1 # DPSGD(R)
python benchmark.py --input_size 32 --model_type cnn --architecture resnet18 --dpsgd_mode elegant --batch_size 16 --profile_memory --warm_up_steps 0 --steps 1 # Proposed