From 478a6a6f78b6badf6d8c77565980a891f640c4d6 Mon Sep 17 00:00:00 2001
From: Dmitrii Emelianenko
Date: Tue, 17 Dec 2024 22:03:08 +0100
Subject: [PATCH] Add custom CI workflow (#4)

---
 .github/workflows/poolside-nightly-build.yaml | 206 ++++++++++++++++++
 poolside-changes.md                           |   2 +
 2 files changed, 208 insertions(+)
 create mode 100644 .github/workflows/poolside-nightly-build.yaml

diff --git a/.github/workflows/poolside-nightly-build.yaml b/.github/workflows/poolside-nightly-build.yaml
new file mode 100644
index 0000000000000..eba22ec7f3d2c
--- /dev/null
+++ b/.github/workflows/poolside-nightly-build.yaml
@@ -0,0 +1,206 @@
+# basically a partial copy of ./generated-linux-binary-manywheel-nightly.yml
+# as the original version is autogenerated, we would need to manually sync this periodically,
+# but this way we avoid conflicts
+name: poolside-linux-binary-manywheel
+
+on:
+  # only manual triggers for now
+  workflow_dispatch:
+    inputs:
+      publish:
+        description: Upload to CodeArtifact
+        type: choice
+        required: true
+        # choice values are strings (the publish steps compare against 'true'),
+        # so quote them to keep YAML from reading them as booleans
+        default: "true"
+        options:
+          - "true"
+          - "false"
+
+env:
+  # Needed for conda builds
+  ANACONDA_USER: pytorch
+  BINARY_ENV_FILE: /tmp/env
+  BUILD_ENVIRONMENT: linux-binary-manywheel
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
+  PYTORCH_ROOT: /pytorch
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  # number- and boolean-looking values are quoted so YAML keeps them as strings
+  SKIP_ALL_TESTS: "1"
+  # All vars below are from the auto-generated ./generated-linux-binary-manywheel-nightly.yml
+  PACKAGE_TYPE: manywheel
+  # TODO: This is a legacy variable that we eventually want to get rid of in
+  #       favor of GPU_ARCH_VERSION
+  DESIRED_CUDA: cu126
+  GPU_ARCH_VERSION: "12.6"
+  GPU_ARCH_TYPE: cuda
+  # Note: we might need to fix a specific version of this image or build one ourselves
+  DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.6-main
+  USE_SPLIT_BUILD: "false"
+  # Folded scalar (>-): the lines below are joined with single spaces into the
+  # same one-line, " | "-separated requirement string as before
+  PYTORCH_EXTRA_INSTALL_REQUIREMENTS: >-
+    nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' |
+    nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64'
+  MAX_JOBS: "32"
+  TORCH_CUDA_ARCH_LIST: "8.6;9.0+PTX"
+  # For publish:
+  CODEARTIFACT_DOMAIN: poolside
+  CODEARTIFACT_REPOSITORY: poolside-packages-python-unstable
+
+concurrency:
+  # for now we only allow one build at a time
+  group: poolside-nightly-pytorch-build
+  cancel-in-progress: true
+
+jobs:
+  build:
+    if: ${{ github.repository_owner == 'poolsideai' }}
+    runs-on: "ubuntu-22.04-64-pytorchci"
+    strategy:
+      matrix:
+        desired_python: ["3.10", "3.12"]
+        include:
+          - desired_python: "3.10"
+            desired_python_major: "3"
+            desired_python_minor: "10"
+          - desired_python: "3.12"
+            desired_python_major: "3"
+            desired_python_minor: "12"
+    env:
+      BUILD_NAME: manywheel-py${{ matrix.desired_python_major }}_${{ matrix.desired_python_minor }}-cuda12_6
+      DESIRED_PYTHON: ${{ matrix.desired_python }}
+    timeout-minutes: 210
+    steps:
+      # Appends the build settings to the file named by $GITHUB_ENV, one NAME=value line each
+      - name: Make the env permanent during this workflow (but not the secrets)
+        shell: bash
+        run: |
+          {
+            echo "PYTORCH_ROOT=${{ env.PYTORCH_ROOT }}"
+            echo "PACKAGE_TYPE=${{ env.PACKAGE_TYPE }}"
+            echo "DESIRED_CUDA=${{ env.DESIRED_CUDA }}"
+            echo "GPU_ARCH_VERSION=${{ env.GPU_ARCH_VERSION }}"
+            echo "GPU_ARCH_TYPE=${{ env.GPU_ARCH_TYPE }}"
+            echo "DOCKER_IMAGE=${{ env.DOCKER_IMAGE }}"
+            echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}"
+            echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}"
+            echo "PYTORCH_EXTRA_INSTALL_REQUIREMENTS=${{ env.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}"
+            echo "ANACONDA_USER=${{ env.ANACONDA_USER }}"
+            echo "BINARY_ENV_FILE=${{ env.BINARY_ENV_FILE }}"
+            echo "BUILD_ENVIRONMENT=${{ env.BUILD_ENVIRONMENT }}"
+            echo "BUILD_NAME=${{ env.BUILD_NAME }}"
+            echo "PR_NUMBER=${{ env.PR_NUMBER }}"
+            echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+            echo "SHA1=${{ env.SHA1 }}"
+            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
+            echo "MAX_JOBS=${{ env.MAX_JOBS }}"
+            echo "TORCH_CUDA_ARCH_LIST=${{ env.TORCH_CUDA_ARCH_LIST }}"
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      - name: Checkout PyTorch to pytorch dir
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          quiet-checkout: true
+
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Build PyTorch binary
+        run: |
+          set -x
+
+          mkdir -p "${RUNNER_TEMP}/artifacts"
+          # "-e NAME" without a value forwards NAME from this step's environment into the container
+          container_name=$(docker run \
+            -e BINARY_ENV_FILE \
+            -e BUILD_ENVIRONMENT \
+            -e DESIRED_CUDA \
+            -e DESIRED_DEVTOOLSET \
+            -e DESIRED_PYTHON \
+            -e GITHUB_ACTIONS \
+            -e GPU_ARCH_TYPE \
+            -e GPU_ARCH_VERSION \
+            -e LIBTORCH_VARIANT \
+            -e PACKAGE_TYPE \
+            -e PYTORCH_FINAL_PACKAGE_DIR \
+            -e PYTORCH_ROOT \
+            -e SKIP_ALL_TESTS \
+            -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
+            -e USE_SPLIT_BUILD \
+            -e MAX_JOBS \
+            -e TORCH_CUDA_ARCH_LIST \
+            --tty \
+            --detach \
+            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
+            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
+            -w / \
+            "${DOCKER_IMAGE}"
+          )
+          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
+          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
+          else
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ env.PACKAGE_TYPE }}/build.sh"
+          fi
+          # give the runner's user and group ownership of everything under /artifacts
+          docker exec -t "${container_name}" chown -R "$(id -u):$(id -g)" /artifacts
+
+      - name: Cleanup docker
+        if: always()
+        shell: bash
+        run: |
+          # stop the container for clean worker stop
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
+
+      # upload to github artifacts (as we might not publish)
+      - uses: actions/upload-artifact@v4.4.0
+        with:
+          name: ${{ env.BUILD_NAME }}
+          if-no-files-found: error
+          path: ${{ runner.temp }}/artifacts/*
+
+      - name: Install publish dependencies
+        if: github.event.inputs.publish == 'true'
+        run: |
+          python -m pip install --upgrade pip
+          pip install twine
+
+      - name: Configure AWS credentials for publish
+        if: github.event.inputs.publish == 'true'
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/gh-action-publish-artifacts-role
+          aws-region: us-east-1
+
+      - name: Publish to CodeArtifact
+        if: github.event.inputs.publish == 'true'
+        run: |
+          # Assign first, export afterwards: "export VAR=$(cmd)" would hide a failing aws call
+          TWINE_USERNAME=aws
+          TWINE_PASSWORD=$(aws codeartifact get-authorization-token --domain "${{ env.CODEARTIFACT_DOMAIN }}" --domain-owner "${{ secrets.AWS_ACCOUNT_ID }}" --query authorizationToken --output text)
+          TWINE_REPOSITORY_URL=$(aws codeartifact get-repository-endpoint --domain "${{ env.CODEARTIFACT_DOMAIN }}" --domain-owner "${{ secrets.AWS_ACCOUNT_ID }}" --repository "${{ env.CODEARTIFACT_REPOSITORY }}" --region us-east-1 --format pypi --query repositoryEndpoint --output text)
+          export TWINE_USERNAME TWINE_PASSWORD TWINE_REPOSITORY_URL
+          twine upload --verbose "${{ runner.temp }}"/artifacts/*
diff --git a/poolside-changes.md b/poolside-changes.md
index e282d6557900d..44c0107fc54b8 100644
--- a/poolside-changes.md
+++ b/poolside-changes.md
@@ -3,4 +3,6 @@
 
 * [17th Dec 2024] Modified Github Actions build CI forkflow.
   * Added poolside tag to version name [PR](https://github.com/poolsideai/pytorch/pull/2)
   * Made cuda\_arch list configurable during CI build [PR](https://github.com/poolsideai/pytorch/pull/3)
+  * Added a custom CI workflow [PR](https://github.com/poolsideai/pytorch/pull/4)
+