Skip to content

Commit

Permalink
Moe converters (#5)
Browse files Browse the repository at this point in the history
* Converters ready

* Added xglm transformers implementation

---------

Co-authored-by: Negar Foroutan <negar.foroutan@epfl.ch>
  • Loading branch information
AleHD and negar-foroutan authored Oct 19, 2024
1 parent 328b8c2 commit 3bce1f4
Show file tree
Hide file tree
Showing 6 changed files with 1,596 additions and 1 deletion.
5 changes: 5 additions & 0 deletions examples/xglm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,8 @@ cd examples/xglm
torchrun --nproc-per-node=1 convert_dense2moe.py --checkpoint-path=checkpoints/xglm-564M --save-path=$SCRATCH/checkpoints/xglm-8x564M --num-experts=8
```
Note that this upcycling _drops_ the bias parameters of the MLP because the MegaBlocks implementation does not support bias parameters. While this is a limitation of the current implementation, the performance is quickly recovered after a few training steps.

To convert the checkpoint back to Hugging Face format, run:
```bash
torchrun examples/xglm/convert_ntmoe2hf.py --checkpoint-path=$SCRATCH/checkpoints/xglm-8x564M --save-path=$SCRATCH/checkpoints/huggingface/xglm-8x564M
```
140 changes: 140 additions & 0 deletions examples/xglm/convert_ntmoe2hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""
Converts a nanotron moe model to HF format
Command:
torchrun --nproc-per-node=1 convert_ntmoe2hf.py --checkpoint-path=nanotron_weights --save-path=hf_weights
"""

import warnings
from argparse import ArgumentParser
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoTokenizer
from tqdm import tqdm

from nanotron.config.models_config import GPT3MoEConfig
from nanotron.models.gpt3_moe import GPT3MoEForTraining, GPT3MoEBlock
from nanotron.models.moe import dMoE, SparseMLP, LearnedRouter

from examples.xglm.convert_dense2moe import create_nt_moe_model
from examples.xglm.convert_nt2hf import convert_attention
from examples.xglm.convert_utils import convert_generic
from examples.xglm.transformers_impl.xglm_model import XGLMForCausalLM, XGLMDecoderLayer, XGLMmoeConfig, XGLMSparseMoeBlock, XGLMMLP
from examples.xglm.transformers_impl.gating import BasicGate


def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig:
    """Translate a nanotron GPT3-MoE config into an ``XGLMmoeConfig``.

    Settings the target implementation cannot represent faithfully are
    reported through ``warnings.warn`` (mismatched embedding/residual dropout,
    a layer-norm epsilon other than 1e-5, a non-zero z-loss weight). A config
    with ``moe_glu`` enabled is rejected with an ``AssertionError``.

    Args:
        config: The nanotron MoE model configuration to translate.

    Returns:
        The equivalent configuration for the transformers implementation.
    """
    # Only ``embd_pdrop`` is forwarded as ``dropout`` below, so a differing
    # ``resid_pdrop`` cannot be expressed in the converted config.
    dropouts_differ = config.embd_pdrop != config.resid_pdrop
    if dropouts_differ:
        warnings.warn(
            f"nanotron.embd_pdrop = {config.embd_pdrop} does not match with "
            f"nanotron.resid_pdrop = {config.resid_pdrop}. "
            "XGLM implementation needs these two values to be equal "
            "for correct conversion."
        )

    epsilon = config.layer_norm_epsilon
    if epsilon != 1e-5:
        warnings.warn(f"nanotron.layer_norm_epsilon must be 1e-5, not {config.layer_norm_epsilon}")

    if config.moe_z_loss_weight != 0:
        warnings.warn("transformer implementation does not support z loss")

    assert not config.moe_glu, "Transformer implementation does not support glu MLP layers"

    # Options shared with the regular (dense) xglm config.
    dense_options = {
        "activation_function": config.activation_function,
        "attention_dropout": config.attn_pdrop,
        "dropout": config.embd_pdrop,
        "eos_token_id": config.eos_token_id,
        "d_model": config.hidden_size,
        "ffn_dim": config.intermediate_size,
        "max_position_embeddings": config.max_position_embeddings,
        "attention_heads": config.num_attention_heads,
        "num_layers": config.num_hidden_layers,
        "vocab_size": config.vocab_size,
        "decoder_start_token_id": config.position_embedding_offset,
        "activation_dropout": config.act_pdrop,
        "scale_embedding": config.scale_embedding,
    }
    # Mixture-of-experts specific options.
    moe_options = {
        "num_local_experts": config.moe_num_experts,
        "num_experts_per_tok": config.num_experts_per_tok,
        "gate_type": "linear",
        "gate_depth": 1,
        "router_aux_loss_coef": config.moe_loss_weight,
    }
    return XGLMmoeConfig(**dense_options, **moe_options)


def convert_mlp(mlp_hf: XGLMMLP, mlp_nt: SparseMLP):
    """Pair up the two layers of an HF MLP with their nanotron counterparts.

    ``fc1`` is matched with ``w1.module`` and ``fc2`` with ``w2.module``; each
    pair is handed to ``convert_generic`` (HF module first, nanotron second).
    """
    first_hf, first_nt = mlp_hf.fc1, mlp_nt.w1.module
    convert_generic(first_hf, first_nt)

    second_hf, second_nt = mlp_hf.fc2, mlp_nt.w2.module
    convert_generic(second_hf, second_nt)


def convert_gate(gate_hf: BasicGate, gate_nt: LearnedRouter):
    """Convert the router: pass the HF ``gate`` and nanotron ``layer`` to ``convert_generic``."""
    target_layer = gate_hf.gate
    source_layer = gate_nt.layer
    convert_generic(target_layer, source_layer)


def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
    """Copy a nanotron dMoE feed-forward block into an HF sparse MoE block.

    The router is converted first. The nanotron expert weights are stored as
    two tensors covering all experts; each HF expert receives its own
    ``intermediate_size``-wide slice of them. Only the ``weight`` tensors of
    ``fc1``/``fc2`` are written here.

    Expected nanotron layouts (checked with assertions):
      * one expert:      w1 is (intermediate, hidden), w2 is (hidden, intermediate)
      * several experts: w1 is (hidden, intermediate * E), w2 is (intermediate * E, hidden)
    """
    convert_gate(ff_hf.gate, ff_nt.gate)

    int_size = ff_nt.config.intermediate_size
    hidden_size = ff_nt.config.hidden_size
    num_experts = len(ff_hf.experts)
    stacked_size = int_size * num_experts
    single_expert = num_experts == 1

    w1 = ff_nt.experts.mlp.w1.module.weight
    w2 = ff_nt.experts.mlp.w2.module.weight

    if single_expert:
        assert w1.shape == (stacked_size, hidden_size)
        assert w2.shape == (hidden_size, stacked_size)
    else:
        assert w1.T.shape == (stacked_size, hidden_size)
        assert w2.shape == (stacked_size, hidden_size)

    with torch.no_grad():
        for index, expert in enumerate(ff_hf.experts):
            # Rows/columns [start, stop) belong to this expert.
            start = index * int_size
            stop = start + int_size
            if single_expert:
                fc1_source = w1[start:stop, :]
                fc2_source = w2[:, start:stop]
            else:
                # Multi-expert tensors are transposed relative to the layout
                # of the HF linear weights, hence the extra ``.T``.
                fc1_source = w1.T[start:stop, :]
                fc2_source = w2[start:stop, :].T
            expert.fc1.weight.copy_(fc1_source.clone())
            expert.fc2.weight.copy_(fc2_source.clone())

def convert_decoder(block_hf: XGLMDecoderLayer, block_nt: GPT3MoEBlock):
    """Convert one decoder layer, sub-module by sub-module.

    Order: attention layer norm (``ln_1``), attention, final layer norm
    (``ln_2``), then the MoE feed-forward block.
    """
    # Pre-attention norm and the attention itself.
    attn_norm_hf, attn_norm_nt = block_hf.self_attn_layer_norm, block_nt.ln_1
    convert_generic(attn_norm_hf, attn_norm_nt)
    convert_attention(block_hf.self_attn, block_nt.attn)

    # Second norm and the sparse feed-forward.
    ff_norm_hf, ff_norm_nt = block_hf.final_layer_norm, block_nt.ln_2
    convert_generic(ff_norm_hf, ff_norm_nt)
    convert_ff(block_hf.block_sparse_moe, block_nt.ff)


def convert(model_hf: XGLMForCausalLM, model_nt: GPT3MoEForTraining):
    """Convert a whole nanotron MoE model into the given HF model, in place.

    Handles the token embedding, every decoder layer (with a progress bar),
    the final layer norm and the LM head. Nanotron modules are reached through
    their ``pp_block`` attribute.
    """
    hf_core = model_hf.model
    nt_core = model_nt.model

    convert_generic(hf_core.embed_tokens, nt_core.token_embeddings.pp_block.token_embedding)

    layer_pairs = zip(hf_core.layers, nt_core.decoder)
    progress = tqdm(
        layer_pairs,
        desc="Converting layers",
        total=model_nt.config.num_hidden_layers,
    )
    for hf_layer, nt_layer in progress:
        convert_decoder(hf_layer, nt_layer.pp_block)

    convert_generic(hf_core.layer_norm, nt_core.final_layer_norm.pp_block)
    # The LM head hangs off the top-level HF model, not ``model_hf.model``.
    convert_generic(model_hf.lm_head, nt_core.lm_head.pp_block)


def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
    """Load a nanotron MoE checkpoint and save it in Hugging Face format.

    Args:
        checkpoint_path: Location of the nanotron checkpoint to load.
        save_path: Directory passed to ``save_pretrained`` for the converted
            model (and the tokenizer, when one is requested).
        tokenizer_name: Name or path given to ``AutoTokenizer.from_pretrained``.
            When ``None`` no tokenizer is saved.
    """
    # Load nanotron model.
    model_nt = create_nt_moe_model(checkpoint_path=checkpoint_path)

    # Init huggingface model from the translated config.
    model_config_hf = convert_config(model_nt.config)
    model_hf = XGLMForCausalLM._from_config(model_config_hf)

    # Save the tokenizer next to the model so the output directory is self-contained.
    if tokenizer_name is not None:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        tokenizer.save_pretrained(save_path)

    # Copy weights and save model. (A leftover debugging tensor that was
    # created here and moved to the GPU without ever being used was removed.)
    convert(model_hf, model_nt)
    print("Saving...")
    model_hf.save_pretrained(save_path)
    print(f"Model saved to {save_path}")


if __name__ == "__main__":
    # Command-line entry point: parse paths and run the conversion.
    # The description used to state the opposite direction (HF -> nanotron).
    parser = ArgumentParser(description="Convert nanotron MoE weights to HF format")
    parser.add_argument(
        "--checkpoint-path", type=Path, default="checkpoints/xglm-7.5B", help="Path to the nanotron checkpoint"
    )
    # NOTE(review): this default looks like a hub model id rather than a local
    # output directory -- confirm it is intended.
    parser.add_argument(
        "--save-path", type=Path, default="facebook/xglm-7.5B", help="Path to save the huggingface model"
    )
    parser.add_argument("--tokenizer-name", type=str, default="facebook/xglm-7.5B")
    args = parser.parse_args()
    # ``main`` returns nothing, so its result is not bound to a name.
    main(args.checkpoint_path, args.save_path, args.tokenizer_name)
182 changes: 182 additions & 0 deletions examples/xglm/tests/test_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import torch
import pytest

import nanotron
from nanotron.config.parallelism_config import ParallelismArgs
from nanotron.config.models_config import GPT3MoEConfig
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.trainer import mark_tied_parameters
from nanotron.models.gpt3_moe import GPT3MoEBlock, GPT3MoEForTraining
from nanotron.models.moe import LearnedRouter, dMoE

from tests.helpers.utils import init_distributed

from examples.xglm.convert_ntmoe2hf import convert_config, convert_gate, convert_ff, convert
from examples.xglm.tests.test_implementation import almost_close
from examples.xglm.transformers_impl.xglm_model import XGLMSparseMoeBlock, XGLMForCausalLM
from examples.xglm.transformers_impl.gating import BasicGate


# Used as ``max_position_embeddings`` in the model config below.
MAX_SEQUENCE_LENGTH = 2048
# If we test with a very large sequence length, precision errors get more
# significant independent of the correct implementation, so the tests use a
# shorter sequence than MAX_SEQUENCE_LENGTH.
TEST_SEQUENCE_LENGTH = 128
BATCH_SIZE = 4
HIDDEN_SIZE = 1024
# Tests run in full precision; torch.bfloat16 is the alternative setting.
DTYPE = torch.float32
TEXT = "Hello. This is a relatively long text. I will use this text to test the conversion scripts. Let's finish this text soon because I don't have much more to say. Final note:"

CONFIG = GPT3MoEConfig(
    # Dropout is disabled everywhere.
    attn_pdrop=0.0,
    embd_pdrop=0.0,
    resid_pdrop=0.0,
    act_pdrop=0.0,
    # Dense model settings.
    eos_token_id=2,
    hidden_size=HIDDEN_SIZE,
    intermediate_size=4096,
    layer_norm_epsilon=1e-05,
    max_position_embeddings=MAX_SEQUENCE_LENGTH,
    num_attention_heads=16,
    num_hidden_layers=24,
    scale_attn_weights=True,
    vocab_size=256008,
    sinusoidal_position_embedding=True,
    position_embedding_offset=2,
    use_spda=DTYPE is not torch.bfloat16,
    # Mixture-of-experts settings.
    is_moe=True,
    moe_num_experts=8,
    num_experts_per_tok=2,
    moe_loss_weight=0.01,
    moe_z_loss_weight=0.0,
    moe_glu=False,
)
# Single-rank layout; expert parallelism is kept at 1 rather than
# CONFIG.moe_num_experts.
PARALLEL_CONFIG = ParallelismArgs(dp=1, pp=1, tp=1, expert_parallel_size=1)


@pytest.fixture
def hidden_states() -> torch.Tensor:
    """Random activations of shape (TEST_SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE) in DTYPE."""
    shape = (TEST_SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE)
    return torch.randn(*shape, dtype=DTYPE)


@pytest.fixture
def input_mask() -> torch.Tensor:
    """All-True boolean mask of shape (BATCH_SIZE, TEST_SEQUENCE_LENGTH)."""
    shape = (BATCH_SIZE, TEST_SEQUENCE_LENGTH)
    return torch.ones(*shape, dtype=torch.bool)


@pytest.fixture
def input_ids() -> torch.Tensor:
    """Random token ids in [0, CONFIG.vocab_size) of shape (BATCH_SIZE, TEST_SEQUENCE_LENGTH)."""
    shape = (BATCH_SIZE, TEST_SEQUENCE_LENGTH)
    return torch.randint(low=0, high=CONFIG.vocab_size, size=shape)


def _test_nt2hf_gate(parallel_context: ParallelContext, hidden_states: torch.Tensor):
    """Check that a converted HF gate produces the same router logits as the nanotron router.

    ``hidden_states`` arrives as (sequence, batch, hidden). The nanotron
    router is fed the tensor flattened as-is, the HF gate is fed the
    batch-first permutation; both outputs are brought back to
    (sequence, batch, -1) before being compared.
    """
    hidden_states = hidden_states.cuda()

    # Build both gates and copy the nanotron parameters into the HF one.
    config_hf = convert_config(CONFIG)
    gate_nt = LearnedRouter(CONFIG).cuda().to(DTYPE)
    gate_hf = BasicGate(config_hf).cuda().to(DTYPE)
    convert_gate(gate_hf, gate_nt)

    # Flatten the inputs in each implementation's own token order.
    tokens_nt = hidden_states.view(-1, HIDDEN_SIZE)
    tokens_hf = hidden_states.permute(1, 0, 2).reshape(-1, HIDDEN_SIZE)

    logits_nt, _, _ = gate_nt(tokens_nt)
    logits_hf = gate_hf(tokens_hf, "")

    # Restore a common (sequence, batch, experts) layout.
    logits_nt = logits_nt.view(TEST_SEQUENCE_LENGTH, BATCH_SIZE, -1)
    logits_hf = logits_hf.view(BATCH_SIZE, TEST_SEQUENCE_LENGTH, -1).permute(1, 0, 2)

    assert logits_nt.size() == logits_hf.size()
    torch.testing.assert_close(logits_nt, logits_hf)


def test_nt2hf_gate(hidden_states: torch.Tensor):
    """Run the gate conversion check through ``init_distributed`` with tp=dp=pp=1."""
    launcher = init_distributed(tp=1, dp=1, pp=1)
    distributed_test = launcher(_test_nt2hf_gate)
    distributed_test(hidden_states=hidden_states)


def _test_nt2hf_ff(parallel_context: ParallelContext, hidden_states: torch.Tensor,
                   num_experts: int, num_experts_per_tok: int):
    """Compare a nanotron dMoE block with its converted HF counterpart.

    A copy of ``CONFIG`` with the requested expert counts is used to build
    both blocks. Outputs are compared with ``almost_close`` after bringing the
    HF result back to the (sequence, batch, hidden) layout.
    """
    hidden_states = hidden_states.cuda()

    # CONFIG with the expert settings of this test case overridden.
    overrides = {"moe_num_experts": num_experts, "num_experts_per_tok": num_experts_per_tok}
    config = GPT3MoEConfig(**{**vars(CONFIG), **overrides})
    config_hf = convert_config(config)

    ff_nt = dMoE(config, parallel_context, PARALLEL_CONFIG).cuda().to(DTYPE)
    ff_hf = XGLMSparseMoeBlock(config_hf).cuda().to(DTYPE)
    convert_ff(ff_hf, ff_nt)

    out_nt = ff_nt(hidden_states)["hidden_states"]

    # The HF block takes batch-first input; permute in and back out.
    batch_first = hidden_states.permute(1, 0, 2).contiguous()
    out_hf, _ = ff_hf(batch_first, "")
    out_hf = out_hf.permute(1, 0, 2)

    assert out_nt.size() == out_hf.size()
    almost_close(out_nt, out_hf, max_far=0.05, far_atol=0.003)


@pytest.mark.parametrize(
    "num_experts,num_experts_per_tok",
    [(1, 1), (2, 1), (4, 1), (4, 2), (8, 1), (8, 2), (8, 4)],
)
def test_nt2hf_ff(hidden_states: torch.Tensor, num_experts: int, num_experts_per_tok: int):
    """Run the feed-forward conversion check for each expert configuration."""
    launcher = init_distributed(tp=1, dp=1, pp=1)
    distributed_test = launcher(_test_nt2hf_ff)
    distributed_test(
        hidden_states=hidden_states,
        num_experts=num_experts,
        num_experts_per_tok=num_experts_per_tok,
    )


def _test_nt2hf_model(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
    """Build a nanotron MoE model, convert it to HF and run both on the same input.

    Args:
        parallel_context: Distributed context supplied by ``init_distributed``.
        input_ids: Token ids; moved to the GPU before use.
        input_mask: Attention mask; moved to the GPU before use.

    Returns:
        A ``(out_nt, out_hf)`` pair of CPU tensors holding the nanotron and HF
        logits. The HF logits are permuted with ``permute(1, 0, 2)`` so both
        have the same size (asserted before returning).
    """
    random_states = nanotron.random.RandomStates({"tp_synced": nanotron.random.get_current_random_state()})
    input_ids = input_ids.cuda()
    input_mask = input_mask.cuda()

    # unfortunately, we can't use float64 with huggingface xglm.
    new_dtype = torch.float32 if DTYPE == torch.float64 else DTYPE

    # Get nanotron model.
    config_nt = GPT3MoEConfig(**vars(CONFIG))
    if new_dtype not in {torch.bfloat16, torch.float16}:
        config_nt.use_spda = True
    model_nt = nanotron.models.build_model(
        model_builder=lambda: GPT3MoEForTraining(
            config=config_nt,
            parallel_context=parallel_context,
            parallel_config=None,
            random_states=random_states,
        ),
        parallel_context=parallel_context,
        dtype=new_dtype,
        device="cuda",
    ).eval()
    mark_tied_parameters(model=model_nt, parallel_context=parallel_context)

    # Create empty model_hf and make conversion.
    model_hf = XGLMForCausalLM(convert_config(config_nt)).cuda().to(new_dtype).eval()
    convert(model_hf, model_nt)

    # The nanotron forward pass needs the auxiliary-loss accumulators passed in.
    # ``input_ids`` is always a real CUDA tensor at this point, so the former
    # TensorPointer fallback (which referenced an undefined ``self``) is gone.
    aux_losses = {
        "load_balancing_loss": torch.zeros(1, device=input_ids.device),
        "z_loss": torch.zeros(1, device=input_ids.device),
    }

    # Get outputs and assert. Each model is freed right after its forward
    # pass to release GPU memory before the next step.
    with torch.no_grad():
        out_nt = model_nt.model(input_ids, input_mask, aux_losses)["sharded_logits"].to(new_dtype)
        del model_nt
        torch.cuda.empty_cache()
        out_hf = model_hf(input_ids=input_ids, attention_mask=input_mask, output_router_logits=False).logits.permute(1, 0, 2)
        del model_hf
        torch.cuda.empty_cache()
    assert out_nt.size() == out_hf.size(), f"{out_nt.size()}, {out_hf.size()}"
    return out_nt.cpu(), out_hf.cpu()


def _test_nt2hf_dummy_xglm(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
    """Compare full-model logits of the nanotron model and its HF conversion."""
    logits_nt, logits_hf = _test_nt2hf_model(parallel_context, input_ids, input_mask)
    # We allow for less than 1% errors, but some of these are very large!
    # (A strict torch.testing.assert_close comparison is not used here.)
    tolerances = {"max_far": 0.01, "far_atol": 2.0}
    almost_close(logits_nt, logits_hf, **tolerances)


def test_nt2hf_dummy_xglm(input_ids: torch.Tensor, input_mask: torch.Tensor):
    """Run the full-model conversion check through ``init_distributed`` with tp=dp=pp=1."""
    launcher = init_distributed(tp=1, dp=1, pp=1)
    distributed_test = launcher(_test_nt2hf_dummy_xglm)
    distributed_test(input_ids=input_ids, input_mask=input_mask)
Loading

0 comments on commit 3bce1f4

Please sign in to comment.