From 6bca13b8128d1803a3edee158c9d3af6d79ba290 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 9 Dec 2024 14:14:03 -0800 Subject: [PATCH] feat: add nix build definitions Signed-off-by: Nicholas Sielicki --- .gitignore | 5 + flake.lock | 283 ++++++++++++++++++ flake.nix | 139 +++++++++ nix/checks.nix | 26 ++ .../0001-add-latest-nccl.nix | 13 + .../0002-use-latest-nccl.nix | 1 + .../0003-nccl-tests-use-mpi.nix | 7 + .../0004-add-ncclAws.nix | 29 ++ .../0005-add-nccl-tests-aws.nix | 8 + .../0001-cuda-dlopen-correctly.patch | 82 +++++ nix/overlays/libfabric/default.nix | 55 ++++ nix/pkgs/aws-ofi-nccl/cleanSource.nix | 76 +++++ nix/pkgs/aws-ofi-nccl/default.nix | 173 +++++++++++ nix/pkgs/ncclWithExtNet.nix | 27 ++ nix/shell.nix | 203 +++++++++++++ nix/ubuntuTestRunners.nix | 45 +++ 16 files changed, 1172 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 nix/checks.nix create mode 100644 nix/cudaPackagesExtensions/0001-add-latest-nccl.nix create mode 100644 nix/cudaPackagesExtensions/0002-use-latest-nccl.nix create mode 100644 nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix create mode 100644 nix/cudaPackagesExtensions/0004-add-ncclAws.nix create mode 100644 nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix create mode 100644 nix/overlays/libfabric/0001-cuda-dlopen-correctly.patch create mode 100644 nix/overlays/libfabric/default.nix create mode 100644 nix/pkgs/aws-ofi-nccl/cleanSource.nix create mode 100644 nix/pkgs/aws-ofi-nccl/default.nix create mode 100644 nix/pkgs/ncclWithExtNet.nix create mode 100644 nix/shell.nix create mode 100644 nix/ubuntuTestRunners.nix diff --git a/.gitignore b/.gitignore index caeaee5b7..5ae053df3 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,8 @@ m4/lt~obsolete.m4 .idea/ .devenv/ .direnv + +result-bin +result +.ctags.d +.tags diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..e5d47105a --- /dev/null +++ b/flake.lock @@ -0,0 +1,283 @@ +{ + 
"nodes": { + "cuda-packages": { + "inputs": { + "flake-parts": [ + "flake-parts" + ], + "git-hooks-nix": [ + "git-hooks" + ], + "nixpkgs": [ + "nixpkgs" + ], + "treefmt-nix": "treefmt-nix" + }, + "locked": { + "lastModified": 1733297643, + "narHash": "sha256-C2Hs+PtXobPu3ddvmD22c/PI529t7p8+q1agw/Cy9cs=", + "owner": "ConnorBaker", + "repo": "cuda-packages", + "rev": "88efcf8fce87704955c960257a7ef43583569712", + "type": "github" + }, + "original": { + "owner": "ConnorBaker", + "repo": "cuda-packages", + "type": "github" + } + }, + "flake-compat": { + "flake": false, + "locked": { + "lastModified": 1696426674, + "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", + "owner": "edolstra", + "repo": "flake-compat", + "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", + "type": "github" + }, + "original": { + "owner": "edolstra", + "repo": "flake-compat", + "type": "github" + } + }, + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1733312601, + "narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9", + "type": "github" + }, + "original": { + "id": "flake-parts", + "type": "indirect" + } + }, + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "git-hooks": { + "inputs": { + "flake-compat": "flake-compat", + "gitignore": "gitignore", + "nixpkgs": "nixpkgs", + "nixpkgs-stable": "nixpkgs-stable" + }, + "locked": { + "lastModified": 1733318908, + "narHash": "sha256-SVQVsbafSM1dJ4fpgyBqLZ+Lft+jcQuMtEL3lQWx2Sk=", + "rev": "6f4e2a2112050951a314d2733a994fbab94864c6", + 
"revCount": 930, + "type": "tarball", + "url": "https://api.flakehub.com/f/pinned/cachix/git-hooks.nix/0.1.930%2Brev-6f4e2a2112050951a314d2733a994fbab94864c6/019392c0-0331-797a-b657-3bd0329ceb15/source.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://flakehub.com/f/cachix/git-hooks.nix/0.1.928.tar.gz" + } + }, + "gitignore": { + "inputs": { + "nixpkgs": [ + "git-hooks", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1709087332, + "narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=", + "owner": "hercules-ci", + "repo": "gitignore.nix", + "rev": "637db329424fd7e46cf4185293b9cc8c88c95394", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "gitignore.nix", + "type": "github" + } + }, + "lib-aggregate": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs-lib": "nixpkgs-lib_2" + }, + "locked": { + "lastModified": 1733055216, + "narHash": "sha256-yB2y7tGJxDI/SDQ0D7b6ocRtLTPm93u8ybdIKQGXRDE=", + "owner": "nix-community", + "repo": "lib-aggregate", + "rev": "f67bf0781c69a46bf3a1469f83c98518aa3054c3", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "lib-aggregate", + "type": "github" + } + }, + "nix-github-actions": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1731952509, + "narHash": "sha256-p4gB3Rhw8R6Ak4eMl8pqjCPOLCZRqaehZxdZ/mbFClM=", + "owner": "nix-community", + "repo": "nix-github-actions", + "rev": "7b5f051df789b6b20d259924d349a9ba3319b226", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nix-github-actions", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1730768919, + "narHash": "sha256-8AKquNnnSaJRXZxc5YmF/WfmxiHX6MMZZasRP6RRQkE=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "a04d33c0c3f1a59a2c1cb0c6e34cd24500e5a1dc", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + 
"nixpkgs-lib": { + "locked": { + "lastModified": 1733096140, + "narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=", + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" + } + }, + "nixpkgs-lib_2": { + "locked": { + "lastModified": 1733015484, + "narHash": "sha256-qiyO0GrTvbp869U4VGX5GhAZ00fSiPXszvosY1AgKQ8=", + "owner": "nix-community", + "repo": "nixpkgs.lib", + "rev": "0e4fdd4a0ab733276b6d2274ff84ae353f17129e", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nixpkgs.lib", + "type": "github" + } + }, + "nixpkgs-stable": { + "locked": { + "lastModified": 1730741070, + "narHash": "sha256-edm8WG19kWozJ/GqyYx2VjW99EdhjKwbY3ZwdlPAAlo=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "d063c1dd113c91ab27959ba540c0d9753409edf3", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-24.05", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "lastModified": 1733064805, + "narHash": "sha256-7NbtSLfZO0q7MXPl5hzA0sbVJt6pWxxtGWbaVUDDmjs=", + "rev": "31d66ae40417bb13765b0ad75dd200400e98de84", + "revCount": 715040, + "type": "tarball", + "url": "https://api.flakehub.com/f/pinned/DeterminateSystems/nixpkgs-weekly/0.1.715040%2Brev-31d66ae40417bb13765b0ad75dd200400e98de84/01938b06-3358-73df-a7e1-598cb884b5d0/source.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://flakehub.com/f/DeterminateSystems/nixpkgs-weekly/0.1.678339.tar.gz" + } + }, + "root": { + "inputs": { + "cuda-packages": "cuda-packages", + "flake-parts": "flake-parts", + "git-hooks": "git-hooks", + "lib-aggregate": "lib-aggregate", + "nix-github-actions": "nix-github-actions", + "nixpkgs": "nixpkgs_2" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": 
"sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "treefmt-nix": { + "inputs": { + "nixpkgs": [ + "cuda-packages", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1732292307, + "narHash": "sha256-5WSng844vXt8uytT5djmqBCkopyle6ciFgteuA9bJpw=", + "owner": "numtide", + "repo": "treefmt-nix", + "rev": "705df92694af7093dfbb27109ce16d828a79155f", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "treefmt-nix", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..d11cf6664 --- /dev/null +++ b/flake.nix @@ -0,0 +1,139 @@ +# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. +# +# See LICENSE for licensing information + +{ + description = "aws-ofi-nccl development/build flake."; + + outputs = + { self, flake-parts, ... }@inputs: + let + inherit (inputs.lib-aggregate) lib; + systems = [ + "x86_64-linux" + "aarch64-linux" + ]; + in + flake-parts.lib.mkFlake { inherit inputs; } ( + { withSystem, flake-parts-lib, ... }: + { + inherit systems; + imports = [ + inputs.git-hooks.flakeModule + flake-parts.flakeModules.easyOverlay + ]; + flake = { + githubActionChecks = inputs.nix-github-actions.lib.mkGithubMatrix { + checks = self.outputs.packages.x86_64-linux; + }; + }; + debug = true; + perSystem = + { + system, + config, + final, + pkgs, + ... 
+ }: + { + _module.args.pkgs = import inputs.nixpkgs { + inherit system; + overlays = [ + (import ./nix/overlays/libfabric) + inputs.cuda-packages.overlays.default + inputs.self.overlays.default + ]; + config = { + cudaSupport = true; + cudaForwardCompat = true; + cudaCapabilities = [ + "7.0" + "7.5" + "8.0" + "8.6" + "8.9" + "9.0" + "9.0a" + ]; + allowBroken = true; + allowUnfree = true; + }; + }; + pre-commit.settings = import ./nix/checks.nix { inherit lib; }; + devShells.default = import ./nix/shell.nix { + inherit + pkgs + config + system + inputs + self + ; + }; + overlayAttrs = { + cudaPackagesExtensions = [ + (import ./nix/cudaPackagesExtensions/0001-add-latest-nccl.nix { inherit (pkgs) fetchFromGitHub; }) + (import ./nix/cudaPackagesExtensions/0002-use-latest-nccl.nix) + (import ./nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix { inherit config; }) + (import ./nix/cudaPackagesExtensions/0004-add-ncclAws.nix { + inherit lib config; + inherit (pkgs) symlinkJoin patchelf; + }) + (import ./nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix { + inherit (pkgs) replaceDependency; + }) + ]; + + inherit (config.packages) + libfabric + openmpi + ; + }; + packages = rec { + aws-ofi-nccl = ( + pkgs.callPackage ./nix/pkgs/aws-ofi-nccl { + inherit inputs self; + } + ); + ubuntu-test-runners = pkgs.callPackage ./nix/ubuntuTestRunners.nix { + nccl-tests = pkgs.pkgsCuda.sm_90.cudaPackages.nccl-tests-aws; + }; + default = aws-ofi-nccl; + inherit (pkgs) + libfabric + openmpi + ; + }; + }; + } + ); + + inputs = { + lib-aggregate.url = "github:nix-community/lib-aggregate"; + nixpkgs.url = "https://flakehub.com/f/DeterminateSystems/nixpkgs-weekly/0.1.678339.tar.gz"; + git-hooks.url = "https://flakehub.com/f/cachix/git-hooks.nix/0.1.928.tar.gz"; + nix-github-actions.url = "github:nix-community/nix-github-actions"; + nix-github-actions.inputs.nixpkgs.follows = "nixpkgs"; + cuda-packages.url = "github:ConnorBaker/cuda-packages"; + 
cuda-packages.inputs.flake-parts.follows = "flake-parts"; + cuda-packages.inputs.nixpkgs.follows = "nixpkgs"; + cuda-packages.inputs.git-hooks-nix.follows = "git-hooks"; + }; + + nixConfig = { + allowUnfree = true; + cudaSupport = true; + extra-substituters = [ + "https://numtide.cachix.org" + "https://nix-community.cachix.org" + "https://devenv.cachix.org" + "https://cuda-maintainers.cachix.org" + ]; + extra-trusted-public-keys = [ + "numtide.cachix.org-1:2ps1kLBUWjxIneOy1Ik6cQjb41X0iXVXeHigGmycPPE=" + "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs=" + "devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=" + "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=" + ]; + }; +} diff --git a/nix/checks.nix b/nix/checks.nix new file mode 100644 index 000000000..b9556e947 --- /dev/null +++ b/nix/checks.nix @@ -0,0 +1,26 @@ +{ lib }: +{ + hooks = { + nixfmt-rfc-style.enable = true; + clang-format = { + enable = true; + types_or = lib.mkForce [ + "c" + "c++" + ]; + }; + actionlint.enable = true; + check-added-large-files.enable = true; + check-xml.enable = true; + detect-aws-credentials.enable = true; + detect-private-keys.enable = true; + editorconfig-checker.enable = true; + mdl.enable = true; + shfmt.enable = true; + shellcheck.enable = true; + #check-merge-conflicts.enable = true; + no-commit-to-branch.enable = true; + forbid-new-submodules.enable = true; + convco.enable = true; + }; +} diff --git a/nix/cudaPackagesExtensions/0001-add-latest-nccl.nix b/nix/cudaPackagesExtensions/0001-add-latest-nccl.nix new file mode 100644 index 000000000..7dd2c3c03 --- /dev/null +++ b/nix/cudaPackagesExtensions/0001-add-latest-nccl.nix @@ -0,0 +1,13 @@ +{ fetchFromGitHub }: +ffinal: pprev: { + nccl_latest = pprev.nccl.overrideAttrs (prevAttrs: { + src = fetchFromGitHub { + owner = "NVIDIA"; + repo = "nccl"; + rev = "v2.23.4-1"; + hash = "sha256-DlMxlLO2F079fBkhORNPVN/ASYiVIRfLJw7bDoiClHw="; + }; + name =
"cuda${ffinal.cudaMajorMinorPatchVersion}-nccl-2.23.4-1"; + version = "2.23.4-1"; + }); +} diff --git a/nix/cudaPackagesExtensions/0002-use-latest-nccl.nix b/nix/cudaPackagesExtensions/0002-use-latest-nccl.nix new file mode 100644 index 000000000..076047126 --- /dev/null +++ b/nix/cudaPackagesExtensions/0002-use-latest-nccl.nix @@ -0,0 +1 @@ +ffinal: pprev: { nccl = pprev.nccl_latest; } diff --git a/nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix b/nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix new file mode 100644 index 000000000..961d17e5f --- /dev/null +++ b/nix/cudaPackagesExtensions/0003-nccl-tests-use-mpi.nix @@ -0,0 +1,7 @@ +{ config }: +ffinal: pprev: { + nccl-tests = pprev.nccl-tests.override { + mpiSupport = true; + mpi = config.packages.openmpi; + }; +} diff --git a/nix/cudaPackagesExtensions/0004-add-ncclAws.nix b/nix/cudaPackagesExtensions/0004-add-ncclAws.nix new file mode 100644 index 000000000..f2aec2b85 --- /dev/null +++ b/nix/cudaPackagesExtensions/0004-add-ncclAws.nix @@ -0,0 +1,29 @@ +{ + lib, + config, + symlinkJoin, + patchelf, +}: +ffinal: pprev: { + ncclAws = symlinkJoin { + inherit (pprev.nccl) + name + ; + paths = [ + (ffinal.backendStdenv.mkDerivation { + name = "${pprev.nccl.name}+ofi-nccl-aws"; + src = pprev.nccl.out; + buildPhase = '' + cp -r . 
$out + ''; + postFixup = '' + ${patchelf}/bin/patchelf --add-rpath ${ + lib.makeLibraryPath [ (lib.getLib config.packages.default) ] + } $out/lib/libnccl.so + ''; + }) + (lib.getLib pprev.nccl) + (lib.getDev pprev.nccl) + ]; + }; +} diff --git a/nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix b/nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix new file mode 100644 index 000000000..d49fd5714 --- /dev/null +++ b/nix/cudaPackagesExtensions/0005-add-nccl-tests-aws.nix @@ -0,0 +1,8 @@ +{ replaceDependency }: +ffinal: pprev: { + nccl-tests-aws = replaceDependency { + drv = ffinal.nccl-tests; + oldDependency = ffinal.nccl; + newDependency = ffinal.ncclAws; + }; +} diff --git a/nix/overlays/libfabric/0001-cuda-dlopen-correctly.patch b/nix/overlays/libfabric/0001-cuda-dlopen-correctly.patch new file mode 100644 index 000000000..e2f87d636 --- /dev/null +++ b/nix/overlays/libfabric/0001-cuda-dlopen-correctly.patch @@ -0,0 +1,82 @@ +From 1c37266a3654e8ea5468ae1b5bef567c5c7753af Mon Sep 17 00:00:00 2001 +From: Nicholas Sielicki +Date: Tue, 26 Nov 2024 11:16:35 -0800 +Subject: [PATCH] cuda-dlopen-correctly + +Signed-off-by: Nicholas Sielicki +--- + fabtests/common/hmem_cuda.c | 8 ++++---- + prov/psm3/psm3/psm.c | 4 ++-- + src/hmem_cuda.c | 8 ++++---- + 3 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c +index 2f02b6f47..435c896fa 100644 +--- a/fabtests/common/hmem_cuda.c ++++ b/fabtests/common/hmem_cuda.c +@@ -151,15 +151,15 @@ int ft_cuda_init(void) + cudaError_t cuda_ret; + int ret; + +- cudart_handle = dlopen("libcudart.so", RTLD_NOW); ++ cudart_handle = dlopen("libcudart.so.12", RTLD_NOW); + if (!cudart_handle) { +- FT_ERR("Failed to dlopen libcudart.so"); ++ FT_ERR("Failed to dlopen libcudart.so.12"); + goto err; + } + +- cuda_handle = dlopen("libcuda.so", RTLD_NOW); ++ cuda_handle = dlopen("libcuda.so.1", RTLD_NOW); + if (!cuda_handle) { +- FT_ERR("Failed to dlopen libcuda.so\n"); ++ 
FT_ERR("Failed to dlopen libcuda.so.1\n"); + goto err_dlclose_cudart; + } + +diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c +index e46f868f0..9dc1467dc 100644 +--- a/prov/psm3/psm3/psm.c ++++ b/prov/psm3/psm3/psm.c +@@ -295,10 +295,10 @@ int psmi_cuda_lib_load() + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice); + + /* CUDA Runtime */ +- psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); ++ psmi_cudart_lib = dlopen("libcudart.so.12", RTLD_LAZY); + if (!psmi_cudart_lib) { + dlerr = dlerror(); +- _HFI_ERROR("Unable to open libcudart.so. Error %s\n", ++ _HFI_ERROR("Unable to open libcudart.so.12. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } +diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c +index 213030411..fb448047a 100644 +--- a/src/hmem_cuda.c ++++ b/src/hmem_cuda.c +@@ -480,17 +480,17 @@ static int cuda_hmem_dl_init(void) + /* Assume failure to dlopen CUDA runtime is caused by the library not + * being found. Thus, CUDA is not supported. + */ +- cuda_attr.runtime_handle = dlopen("libcudart.so", RTLD_NOW); ++ cuda_attr.runtime_handle = dlopen("libcudart.so.12", RTLD_NOW); + if (!cuda_attr.runtime_handle) { + FI_INFO(&core_prov, FI_LOG_CORE, +- "Failed to dlopen libcudart.so\n"); ++ "Failed to dlopen libcudart.so.12\n"); + return -FI_ENOSYS; + } + +- cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW); ++ cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW); + if (!cuda_attr.driver_handle) { + FI_WARN(&core_prov, FI_LOG_CORE, +- "Failed to dlopen libcuda.so\n"); ++ "Failed to dlopen libcuda.so.1\n"); + goto err_dlclose_cuda_runtime; + } + +-- +2.47.0 + diff --git a/nix/overlays/libfabric/default.nix b/nix/overlays/libfabric/default.nix new file mode 100644 index 000000000..2d7e9310a --- /dev/null +++ b/nix/overlays/libfabric/default.nix @@ -0,0 +1,55 @@ +final: prev: { + libfabric = ( + let + joined = ( + final.symlinkJoin { + name = "cuda-build-deps-joined"; + paths = [ + (final.lib.getLib final.cudaPackages.cuda_cudart) + 
(final.lib.getDev final.cudaPackages.cuda_cudart) + (final.lib.getLib final.cudaPackages.cuda_nvcc) + (final.lib.getDev final.cudaPackages.cuda_nvcc) + (final.lib.getLib final.cudaPackages.cuda_nvml_dev) + (final.lib.getDev final.cudaPackages.cuda_nvml_dev) + ]; + } + ); + in + (prev.libfabric.overrideAttrs (pprev: { + pname = "libfabric-aws"; + src = final.fetchFromGitHub { + owner = "aws"; + repo = "libfabric"; + rev = "v1.22.0amzn4.0"; + hash = "sha256-Y79fwGJQI+AHqWBmydILFGMLTfFdqC6gr59Xnb24Llc="; + }; + patches = [ + ./0001-cuda-dlopen-correctly.patch + ]; + version = "1.22.0-4.0"; + buildInputs = (pprev.buildInputs or [ ]) ++ [ + final.rdma-core + final.cudaPackages.cuda_cudart + final.cudaPackages.cuda_nvcc + final.cudaPackages.cuda_nvml_dev + ]; + configureFlags = (pprev.configureFlags or [ ]) ++ [ + "--enable-efa=yes" + "--with-cuda=${joined}/" + "--enable-cuda-dlopen" + ]; + nativeBuildInputs = (pprev.nativeBuildInputs or [ ]) ++ [ + final.autoAddDriverRunpath + final.autoPatchelfHook + ]; + appendRunpaths = final.lib.makeLibraryPath [ + joined + ]; + })).override + ({ + enableOpx = false; + enablePsm2 = false; + stdenv = final.cudaPackages.backendStdenv; + }) + ); +} diff --git a/nix/pkgs/aws-ofi-nccl/cleanSource.nix b/nix/pkgs/aws-ofi-nccl/cleanSource.nix new file mode 100644 index 000000000..9da6fe133 --- /dev/null +++ b/nix/pkgs/aws-ofi-nccl/cleanSource.nix @@ -0,0 +1,76 @@ +{ lib, self }: +let + inherit (lib.fileset) + intersection + difference + unions + fileFilter + fromSource + toSource + gitTracked + traceVal + ; + inherit (builtins) + any + ; + + dirs = { + third-party = ./../../../3rd-party; + docs = ./../../../doc; + headers = ./../../../include; + mfour = ./../../../m4; + nix = ./../../../nix; + tus = ./../../../src; + tests = ./../../../tests; + topologies = ./../../../topology; + }; + + sourceFilter = fileFilter ( + file: + any file.hasExt [ + "c" + "cc" + "cpp" + "h" + "hpp" + "hh" + "xml" + ] + ); + + buildFileFilter = fileFilter ( + 
file: + any file.hasExt [ + "in" + "m4" + "ac" + "am" + ] + ); + + cleanRepo = traceVal (gitTracked ../../../.); + cleaned = x: intersection x (gitTracked ../../../.); + sourceFiles = cleaned (sourceFilter ../../../.); + buildFiles = cleaned (buildFileFilter ../../../.); + thirdPartyFiles = cleaned ../../../.; + thirdPartyBuildFiles = unions [ + thirdPartyFiles + buildFiles + ]; + thirdPartySourceFiles = difference [ + thirdPartyFiles + thirdPartyBuildFiles + ]; + + projectSourceFiles = difference [ + sourceFiles + thirdPartySourceFiles + ]; +in +lib.fileset.toSource { + root = ../../../.; + fileset = lib.fileset.unions [ + buildFiles + sourceFiles + ]; +} diff --git a/nix/pkgs/aws-ofi-nccl/default.nix b/nix/pkgs/aws-ofi-nccl/default.nix new file mode 100644 index 000000000..1e4047e9f --- /dev/null +++ b/nix/pkgs/aws-ofi-nccl/default.nix @@ -0,0 +1,173 @@ +{ + lib, + inputs, + self, + fetchFromGitHub, + symlinkJoin, + releaseTools, + stdenv, + config, + libfabric, + hwloc, + perl, + libtool, + autoconf, + automake, + autoreconfHook, + lttng-ust, + valgrind, + mpi, + cudaPackages ? { }, + autoAddDriverRunpath, + neuronSupport ? (!config.cudaSupport), + cudaSupport ? (config.cudaSupport && !neuronSupport), + enableTests ? cudaSupport, + enableTracePrints ? true, + enableLTTNGTracing ? false, + enablePickyCompiler ? true, + enableWerror ? true, + enableNVTXTracing ? false, + enableValgrind ? false, + enableAwsTuning ? 
true, +}: + +assert neuronSupport != cudaSupport; +assert !enableNVTXTracing || (enableNVTXTracing && cudaSupport); +let + + effectiveStdenv = if cudaSupport then cudaPackages.backendStdenv else stdenv; + + cudaBuildDepsJoined = symlinkJoin { + name = "cuda-build-deps-joined"; + paths = lib.optionals (cudaSupport) ( + [ + (lib.getDev cudaPackages.cuda_nvcc) + cudaPackages.cuda_cudart.include + ] + ++ ( + if effectiveStdenv.hostPlatform.isStatic then + [ + (lib.getOutput "static" cudaPackages.cuda_cudart) + ] + else + [ + (lib.getLib cudaPackages.cuda_cudart) + ] + ) + ); + }; +in +effectiveStdenv.mkDerivation { + name = "aws-ofi-nccl"; + pname = lib.concatStringsSep "" [ + "lib" + (if neuronSupport then "nccom" else "nccl") + "-net-ofi" + (lib.optionalString enableAwsTuning "-aws") + ]; + version = inputs.self.shortRev or inputs.self.dirtyShortRev; + src = import ./cleanSource.nix { + inherit lib; + inherit self; + }; + + nativeBuildInputs = + [ autoreconfHook ] + ++ lib.optionals cudaSupport [ + autoAddDriverRunpath + cudaPackages.cuda_nvcc + ]; + + buildInputs = + [ + libfabric + hwloc + ] + ++ lib.optionals cudaSupport [ + cudaBuildDepsJoined + ] + ++ lib.optionals enableValgrind [ + valgrind + ] + ++ lib.optionals enableTests [ + mpi + ] + ++ lib.optionals enableLTTNGTracing [ + lttng-ust + ]; + + configureFlags = [ + # core deps + (lib.withFeatureAs true "libfabric" (lib.getDev libfabric)) + (lib.withFeatureAs true "hwloc" (lib.getDev hwloc)) + #(lib.withFeatureAs true "nccl-headers" (cudaPackages.nccl.dev)) + + # libs + (lib.withFeatureAs enableTests "mpi" (lib.getDev mpi)) + (lib.enableFeature enableTests "tests") + (lib.withFeatureAs enableLTTNGTracing "lttng" (lib.getDev lttng-ust)) + (lib.withFeatureAs enableValgrind "valgrind" (lib.getDev valgrind)) + + # accelerator support + (lib.enableFeature neuronSupport "neuron") + (lib.withFeatureAs cudaSupport "cuda" cudaBuildDepsJoined) + (lib.withFeatureAs (enableNVTXTracing && cudaSupport) "nvtx" (lib.getDev 
cudaPackages.cuda_nvtx)) + (lib.enableFeature (!effectiveStdenv.hostPlatform.isStatic) "cudart-dynamic") + + # build configuration + (lib.enableFeature enableAwsTuning "platform-aws") + (lib.enableFeature enablePickyCompiler "picky-compiler") + (lib.enableFeature enableWerror "werror") + (lib.enableFeature enableTracePrints "trace") + ]; + + meta = with lib; { + homepage = "https://github.com/aws/aws-ofi-nccl"; + license = licenses.asl20; + broken = (cudaSupport && !config.cudaSupport); + maintainers = with maintainers; [ sielicki ]; + platforms = [ + "x86_64-linux" + "aarch64-linux" + ]; + }; + + hardeningEnable = [ + "format" + "fortify3" + "shadowstack" + "pacret" + "pic" + "pie" + "stackprotector" + "stackclashprotection" + "strictoverflow" + "trivialautovarinit" + ]; + enableParallelBuilding = true; + separateDebugInfo = true; + strictDeps = true; + + outputs = [ + "dev" + "out" + ] ++ lib.optionals enableTests [ "bin" ]; + postInstall = '' + find "$out" -name "*.la" -delete # drop libtool archives; succeeds even when none were installed + mkdir -p $dev/nix-support/generated-headers/include && cp include/config.h $dev/nix-support/generated-headers/include/ + cp config.log $dev/nix-support/config.log + ''; + + doCheck = enableTests; + checkPhase = '' + set -euo pipefail + for test in $(find tests/unit/ -type f -executable -print | xargs) ; do + echo "======================================================================" + echo "Running $test" + if ./$test; then echo "✅ Passed" + else echo "❌ Failed!"; exit 1; fi # report and abort on the first failing unit test + done + echo "All unit tests passed successfully."
+ set +u + ''; +} diff --git a/nix/pkgs/ncclWithExtNet.nix b/nix/pkgs/ncclWithExtNet.nix new file mode 100644 index 000000000..3b2c84854 --- /dev/null +++ b/nix/pkgs/ncclWithExtNet.nix @@ -0,0 +1,27 @@ +{ + lib, + stdenv, + patchelf, + symlinkJoin, + nccl, + plugin, +}: +symlinkJoin { + name = "${nccl.name}-${plugin.name}-joined"; + paths = [ + (stdenv.mkDerivation { + name = "${nccl.name}+net-${plugin.name}"; + src = nccl.out; + buildPhase = '' + cp -r . $out + ''; + postFixup = '' + ${patchelf}/bin/patchelf --add-rpath ${ + lib.makeLibraryPath [ (lib.getLib plugin) ] + } $out/lib/libnccl.so + ''; + }) + (lib.getLib nccl) + (lib.getDev nccl) + ]; +} diff --git a/nix/shell.nix b/nix/shell.nix new file mode 100644 index 000000000..0df25bf30 --- /dev/null +++ b/nix/shell.nix @@ -0,0 +1,203 @@ +{ + self, + config, + system, + inputs, + pkgs, +}: +let + source-dir = builtins.getEnv "PWD"; + + clang-format-file = pkgs.writeTextFile { + name = "clang-format-config"; + text = pkgs.lib.generators.toYAML { } { + AlignConsecutiveAssignments = false; + AlignConsecutiveBitFields = { + AcrossComments = true; + AcrossEmptyLines = true; + Enabled = true; + }; + AlignConsecutiveDeclarations = false; + AlignConsecutiveMacros = { + AcrossComments = true; + AcrossEmptyLines = true; + Enabled = true; + }; + AlignConsecutiveShortCaseStatements = { + AcrossComments = true; + AcrossEmptyLines = true; + AlignCaseColons = false; + Enabled = true; + }; + AlignOperands = "Align"; + AlignTrailingComments = { + Kind = "Always"; + OverEmptyLines = 0; + }; + AllowShortCompoundRequirementOnASingleLine = true; + KeepEmptyLines = { + AtEndOfFile = false; + AtStartOfBlock = false; + AtStartOfFile = false; + }; + AllowAllArgumentsOnNextLine = false; + AllowShortFunctionsOnASingleLine = "None"; + AllowShortIfStatementsOnASingleLine = false; + AllowShortLoopsOnASingleLine = false; + BasedOnStyle = "Google"; + BinPackArguments = false; + BinPackParameters = false; + BracedInitializerIndentWidth = 8; + 
BreakBeforeBraces = "Linux"; + ColumnLimit = 130; + ContinuationIndentWidth = 8; + IncludeBlocks = "Regroup"; + IncludeCategories = [ + { + Priority = -40; + Regex = "^([\"]config[.]h[\"])$"; + SortPriority = -40; + } + { + Priority = 5; + Regex = "^[<](rdma/|uthash/|nccl/|mpi|hwloc/|lttng/|valgrind/|cuda).*[.]h[>]$"; + SortPriority = 5; + } + { + Priority = 10; + Regex = "^([\"]nccl.*[.]h[\"])$"; + SortPriority = 10; + } + ]; + IndentCaseLabels = false; + IndentWidth = 8; + InsertBraces = true; + InsertNewlineAtEOF = true; + LineEnding = "LF"; + MaxEmptyLinesToKeep = 2; + PointerAlignment = "Right"; + ReferenceAlignment = "Right"; + ReflowComments = true; + RemoveParentheses = "MultipleParentheses"; + SortIncludes = "CaseSensitive"; + SpacesBeforeTrailingComments = 2; + TabWidth = 8; + BreakBinaryOperations = "RespectPrecedence"; + AllowShortCaseExpressionOnASingleLine = true; + UseTab = "ForContinuationAndIndentation"; + }; + }; + + editorconfig-file = pkgs.writeTextFile { + name = "editorconfig-config"; + text = pkgs.lib.generators.toINIWithGlobalSection { } { + globalSection = { + root = true; + }; + sections = { + "*" = { + trim_trailing_whitespace = true; + charset = "utf-8"; + end_of_line = "lf"; + insert_final_newline = true; + }; + "*.am" = { + indent_size = 8; + indent_style = "tab"; + }; + "*.md" = { + indent_size = 2; + indent_style = "space"; + }; + "*.nix" = { + tab_width = 4; + indent_size = 2; + indent_style = "space"; + }; + "*.{c,h,cc,hh,cu}" = { # EditorConfig brace alternation is comma-separated + tab_width = 8; + indent_size = 8; + indent_style = "tab"; + }; + }; + }; + }; + + clangd-file = pkgs.writeTextFile { + name = "clangd-config"; + text = pkgs.lib.generators.toYAML { } { + CompileFlags = { + Add = [ + "-Wall" + "-Wextra" + "-Wformat" + "-xc++" + "-std=c++23" + "-isystem${pkgs.glibc_multi.dev}/include/" + "-isystem${pkgs.hwloc.dev}/include/" + "-isystem${pkgs.cudaPackages.cuda_cudart.dev}/include/" + "-isystem${pkgs.cudaPackages.cuda_nvtx.dev}/include/" +
"-isystem${config.packages.libfabric.dev}/include/" + "-isystem${config.packages.openmpi.dev}/include/" + "-I${config.packages.default}/nix-support/generated-headers/include/" + "-I${source-dir}/include/" + "-I${source-dir}/3rd-party/nccl/cuda/include/" + ]; + }; + Diagnostics = { + ClangTidy = { + CheckOptions = { + "cppcoreguidelines-avoid-magic-numbers.IgnoreTypeAliases" = true; + "readability-magic-numbers.IgnoreTypeAliases" = true; + }; + }; + Includes = { + IgnoreHeader = [ + "hwloc.h" + "config.h" + ]; + }; + }; + }; + }; + clionConfigureFlags = pkgs.writeTextFile { + name = ".configureFlags"; + text = pkgs.lib.concatStringsSep " " config.packages.default.configureFlags; + }; +in +pkgs.mkShell { + inputsFrom = [ + self.packages.${system}.aws-ofi-nccl + config.packages.libfabric + config.packages.openmpi + ]; + packages = [ + #pkgs.llvmPackages_git.clang-analyzer + pkgs.llvmPackages_git.clang-tools + pkgs.llvmPackages_git.clang + pkgs.gcc + pkgs.gdb + pkgs.include-what-you-use + pkgs.llvmPackages_git.libclang.python + + pkgs.ccache + pkgs.cppcheck + pkgs.universal-ctags + pkgs.act + pkgs.actionlint + + pkgs.gh + pkgs.git + pkgs.eksctl + pkgs.awscli2 + + pkgs.nixfmt-rfc-style + ]; + shellHook = '' + rm -f ${source-dir}/.clangd && ln -s ${clangd-file} ${source-dir}/.clangd + rm -f ${source-dir}/.editorconfig && ln -s ${editorconfig-file} ${source-dir}/.editorconfig + rm -f ${source-dir}/.clang-format && ln -s ${clang-format-file} ${source-dir}/.clang-format + rm -f ${source-dir}/.clion-configure-flags && ln -s ${clionConfigureFlags} ${source-dir}/.clion-configure-flags + ${config.pre-commit.installationScript} + ''; +} diff --git a/nix/ubuntuTestRunners.nix b/nix/ubuntuTestRunners.nix new file mode 100644 index 000000000..1fc120cf3 --- /dev/null +++ b/nix/ubuntuTestRunners.nix @@ -0,0 +1,45 @@ +{ + config, + lib, + symlinkJoin, + writeShellScriptBin, + openmpi, + libfabric, + nccl-tests, +}: +let + tests = [ + "all_gather" + "all_reduce" + "alltoall" + 
"broadcast" + "gather" + "hypercube" + "reduce" + "reduce_scatter" + "scatter" + "sendrecv" + ]; + ubuntuLibs = [ + "/usr/lib/x86_64-linux-gnu/libcuda.so.1" + "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1" + "/usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1" + ]; + libPathLibs = [ + config.packages.default + openmpi + libfabric + ]; + makeNcclTestRunner = + collName: + writeShellScriptBin "${collName}_perf" '' + LD_PRELOAD="${lib.concatStringsSep ":" ubuntuLibs}" \ + NCCL_TUNER_PLUGIN=libnccl-ofi-tuner.so \ + exec ${lib.getExe' nccl-tests "${collName}_perf"} $@ + ''; + runners = builtins.map makeNcclTestRunner tests; +in +symlinkJoin { + name = "ubuntu-nccl-tests-wrappers"; + paths = runners; +}