From 62de0b2e4b72bbc81b85c3ce9d36a877a0a81e46 Mon Sep 17 00:00:00 2001
From: Martin Jaggi
Date: Wed, 29 Nov 2023 21:39:56 +0100
Subject: [PATCH] minor consistency edit

---
 AUTHORS        | 1 +
 README.md      | 2 +-
 docs/index.rst | 9 +++++----
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index d03a9dc..61af51a 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -11,5 +11,6 @@ Kyle Matoba, Idiap Research Institute and EPFL
 Amirkeivan Mohtashami, EPFL
 Matteo Pagliardini, EPFL
 Francesco Salvi,
+Xingyao Wang
diff --git a/README.md b/README.md
index b271898..e36b8fe 100644
--- a/README.md
+++ b/README.md
@@ -60,7 +60,7 @@ If you use this software please cite it:
                   Francesco Salvi and
                   Antoine Bosselut and
                   Martin Jaggi},
-  title        = {epfLLM Megatron-LM},
+  title        = {epfLLM Megatron-LLM},
   year         = 2023,
   url          = {https://github.com/epfLLM/Megatron-LLM}
 }
diff --git a/docs/index.rst b/docs/index.rst
index 0c52873..7be3d3e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -8,14 +8,14 @@ Our repository is a modification of the `original Megatron-LM codebase
 
 Added key features include:
 
-- `LLaMa `_, `LLaMa 2 `_, `Falcon `_, `Code Llama `_ `Mistral https://arxiv.org/abs/2310.06825`_ support.
-- support training of large models (70B Llama 2, 65B Llama 1, 34B Code Llama, and 40B Falcon) on commodity hardware on multiple nodes
+- architectures supported: `LLaMa `_, `LLaMa 2 `_, `Falcon `_, `Code Llama `_ and `Mistral https://arxiv.org/abs/2310.06825`_.
+- support training of large models (70B Llama 2, 65B Llama 1, 34B Code Llama, 40B Falcon and Mistral) on commodity hardware on multiple nodes
 - 3-way parallelism: tensor parallel, pipeline parallel and data parallel training (inherited from Megatron)
 - full pretraining, finetuning and instruct tuning support
 - Support for special tokens & tokenizers
 - grouped-query attention (GQA) and multi-query attention (MQA)
 - Rotary Position Embeddings (RoPE), RMS layer norm, Lima dropout
-- `ROPE scaling `_ for longer attention context support
+- `RoPE scaling `_ for longer attention context support
 - FlashAttention 2
 - BF16 / FP16 training
 - WandB integration
@@ -61,6 +61,7 @@ If you use this software please cite it:
                   Andreas Köpf and
                   Kyle Matoba and
                   Amirkeivan Mohtashami and
+                  Xingyao Wang and
                   Olivia Simin Fan and
                   Axel Marmet and
                   Deniz Bayazit and
@@ -69,7 +70,7 @@
                   Francesco Salvi and
                   Antoine Bosselut and
                   Martin Jaggi},
-  title        = {epfLLM Megatron-LM},
+  title        = {epfLLM Megatron-LLM},
   year         = 2023,
   url          = {https://github.com/epfLLM/Megatron-LLM}
 }