Added primitives for speculative decoding and tests #2719
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow name as displayed in the Actions UI.
name: Tests

# Run on every push to main and on every pull request.
on:
  push:
    branches: [ main ]
  pull_request:
jobs:
  # Launches a tiny local swarm (one DHT bootstrap node + four servers) on the
  # runner, then runs pytest and the benchmark scripts against it. One job is
  # started per (model, os, python-version) entry listed in the matrix.
  run-tests:
    strategy:
      matrix:
        include:
          - { model: 'bigscience/bloom-560m', os: 'ubuntu', python-version: '3.8' }
          - { model: 'bigscience/bloom-560m', os: 'ubuntu', python-version: '3.11' }
          - { model: 'Maykeye/TinyLLama-v0', os: 'ubuntu', python-version: '3.8' }
          - { model: 'Maykeye/TinyLLama-v0', os: 'ubuntu', python-version: '3.11' }
          - { model: 'Maykeye/TinyLLama-v0', os: 'macos', python-version: '3.10' }
          - { model: 'Maykeye/TinyLLama-v0', os: 'macos', python-version: '3.11' }
          - { model: 'artek0chumak/TestMixtral', os: 'ubuntu', python-version: '3.8' }
          - { model: 'artek0chumak/TestMixtral', os: 'ubuntu', python-version: '3.11' }
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
    runs-on: ${{ matrix.os }}-latest
    timeout-minutes: 20
    steps:
      - name: Increase swap space
        if: ${{ matrix.os == 'ubuntu' }}
        uses: pierotofy/set-swap-space@master
        with:
          swap-size-gb: 10
      - name: Checkout
        uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Cache dependencies
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          # The OS is part of the key so that ubuntu and macos jobs using the
          # same Python version do not resolve to the same key.
          key: Key-v1-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('setup.cfg') }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Quoted so the shell never treats [dev] as a glob character class.
          pip install '.[dev]'
      - name: Test
        run: |
          set -x  # Print executed commands
          export MODEL_NAME="${{ matrix.model }}"
          export REF_NAME="${{ matrix.model }}"
          export ADAPTER_NAME="${{ matrix.model == 'bigscience/bloom-560m' && 'artek0chumak/bloom-560m-safe-peft' || '' }}"

          # [Step 1] Set up a tiny test swarm (see https://github.com/bigscience-workshop/petals/wiki/Launch-your-own-swarm)

          python -m petals.cli.run_dht \
            --identity_path tests/bootstrap.id --host_maddrs /ip4/127.0.0.1/tcp/31337 &> bootstrap.log &
          BOOTSTRAP_PID=$!

          export INITIAL_PEERS=/ip4/127.0.0.1/tcp/31337/p2p/QmS9KwZptnVdB9FFV7uGgaTq4sEKBwcYeKZDfSpyKDUd1g
          # ^-- multiaddr in INITIAL_PEERS is determined by --identity_path and --host_maddrs

          # Wait for DHT init: the log file becomes non-empty once the node starts writing to it.
          # Abort right away if the bootstrap process has already exited, instead of
          # polling an empty log until the job-level timeout.
          until [ -s bootstrap.log ]; do
            kill -0 "$BOOTSTRAP_PID" || { echo "DHT bootstrap process exited before writing to bootstrap.log" >&2; exit 1; }
            sleep 5
          done

          # RUN_SERVER is a plain string holding the common server command line. It is
          # expanded UNQUOTED below on purpose so that the shell splits it into words.
          export RUN_SERVER="python -m petals.cli.run_server $MODEL_NAME \
            --device cpu --torch_dtype float32 --initial_peers $INITIAL_PEERS"
          export TENSOR_PARALLEL_ARGS="${{ matrix.model == 'bigscience/bloom-560m' && '--tensor_parallel_devices cpu cpu' || '' }}"

          # $ADAPTER_NAME and $TENSOR_PARALLEL_ARGS are also left unquoted on purpose:
          # when they are empty they must expand to no argument at all, not to "".
          $RUN_SERVER --adapters $ADAPTER_NAME --num_blocks 5 --throughput 1 --mean_balance_check_period 10 &> server1.log &
          SERVER1_PID=$!
          # ^-- rebalancing test: this server chooses blocks 0:5, then sees a gap in the swarm and moves there

          sleep 10  # wait for the 1st server to choose blocks

          $RUN_SERVER --adapters $ADAPTER_NAME --block_indices 0:5 --throughput 1 --identity_path tests/server2.id &> server2.log &
          SERVER2_PID=$!

          $RUN_SERVER --adapters $ADAPTER_NAME --num_blocks 14 --throughput auto \
            --attn_cache_tokens 2048 --max_chunk_size_bytes 1024 &> server3.log &
          SERVER3_PID=$!
          # ^-- chunking test

          $RUN_SERVER $TENSOR_PARALLEL_ARGS --block_indices 0:2 --throughput auto &> server4.log &
          SERVER4_PID=$!
          # ^-- tensor parallelism test (not compatible with adapters yet)

          sleep 5  # wait for the log files to appear

          # Stream all peer logs into the step output for the rest of the step.
          tail -n 100 -f bootstrap.log server*.log &
          LOGGER_PID=$!

          sleep 30  # wait for servers to eval throughput, download layers, and rebalance
          kill -0 "$BOOTSTRAP_PID" "$SERVER1_PID" "$SERVER2_PID" "$SERVER3_PID" "$SERVER4_PID"  # ensure all peers survived init

          # [Step 2] Run PyTest

          # Share disk cache between Petals servers, clients, and HF Transformers
          export TRANSFORMERS_CACHE=~/.cache/petals

          # Necessary for @pytest.mark.forked to work properly on macOS, see https://github.com/kevlened/pytest-parallel/issues/93
          export no_proxy='*'
          export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES

          # Limit default ClientConfig.max_retries to see tracebacks instead of retrying indefinitely
          export PETALS_MAX_RETRIES=10

          pytest tests --durations=0 --durations-min=1.0 -v

          # [Step 3] Check if benchmarks work (their results here are meaningless since it's a tiny swarm of CPU servers)

          python benchmarks/benchmark_inference.py --model "$MODEL_NAME" --initial_peers "$INITIAL_PEERS" --torch_dtype float32 \
            --seq_len 3
          python benchmarks/benchmark_forward.py --model "$MODEL_NAME" --initial_peers "$INITIAL_PEERS" --torch_dtype float32 \
            --seq_len 3 --batch_size 3 --n_steps 1
          python benchmarks/benchmark_training.py --model "$MODEL_NAME" --initial_peers "$INITIAL_PEERS" --torch_dtype float32 \
            --seq_len 3 --batch_size 3 --pre_seq_len 1 --n_steps 1 --task cls
          python benchmarks/benchmark_training.py --model "$MODEL_NAME" --initial_peers "$INITIAL_PEERS" --torch_dtype float32 \
            --seq_len 3 --batch_size 3 --pre_seq_len 1 --n_steps 1 --task causal_lm

          # [Step 4] Clean up

          # Kept as the last command of the step, so its exit status is the step's status.
          kill -s SIGINT "$BOOTSTRAP_PID" "$SERVER1_PID" "$SERVER2_PID" "$SERVER3_PID" "$SERVER4_PID" "$LOGGER_PID"