From b41f0f3502793c602aa6f1ef50a05754cf084311 Mon Sep 17 00:00:00 2001 From: kisate Date: Wed, 30 Oct 2024 03:52:25 +0000 Subject: [PATCH 01/45] hf vs nanotron idefics3 --- .gitignore | 2 + src/nanotron/config/models_config.py | 3 +- src/nanotron/models/idefics.py | 11 +- tools/idefics3/build_nanotron_from_hf.py | 488 ++++++++++++++++++ .../convert_hf_to_nanotron.py | 106 ++-- tools/idefics3/generate_hf_predictions.py | 209 ++++++++ .../generate_nanotron_predictions.py | 11 +- 7 files changed, 772 insertions(+), 58 deletions(-) create mode 100644 tools/idefics3/build_nanotron_from_hf.py rename tools/{idefics2 => idefics3}/convert_hf_to_nanotron.py (85%) create mode 100644 tools/idefics3/generate_hf_predictions.py rename tools/{idefics2 => idefics3}/generate_nanotron_predictions.py (95%) diff --git a/.gitignore b/.gitignore index cbc04eaf..0037d9ae 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,5 @@ cython_debug/ checkpoints/ wandb/ + +nanotron-ckpt/* \ No newline at end of file diff --git a/src/nanotron/config/models_config.py b/src/nanotron/config/models_config.py index dc115305..4222395d 100644 --- a/src/nanotron/config/models_config.py +++ b/src/nanotron/config/models_config.py @@ -163,7 +163,8 @@ class Idefics3Config: vision_config: Idefics3VisionConfig llama_config: LlamaConfig - image_token_id: int = 32001 + image_token_id: int = 128257 + pad_token_id: int = 128_002 scale_factor: int = 2 diff --git a/src/nanotron/models/idefics.py b/src/nanotron/models/idefics.py index e2ae47f4..447d70f8 100644 --- a/src/nanotron/models/idefics.py +++ b/src/nanotron/models/idefics.py @@ -567,17 +567,13 @@ def __init__( self.input_size = hidden_size * (config.scale_factor ** 2) self.output_size = hidden_size - first_contiguous_chunks = ( - self.input_size, # shape of up_linear - ) self.proj = TensorParallelColumnLinear( self.input_size, - hidden_size, + self.output_size, pg=tp_pg, mode=tp_mode, bias=False, async_communication=tp_linear_async_communication, - contiguous_chunks=first_contiguous_chunks, tp_recompute_allgather=parallel_config.tp_recompute_allgather, ) @@ -604,6 +600,7 @@ def __init__( def pixel_shuffle(self, x, scale_factor=2): bsz, seq, embed_dim = x.size() height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor) x = x.permute(0, 2, 1, 3) @@ -714,6 +711,10 @@ def forward( patch_attention_mask=patch_attention_mask, )["hidden_states"] + print( + image_hidden_states.shape, + ) + # Modality projection & resampling image_hidden_states = self.connector( hidden_states=image_hidden_states diff --git a/tools/idefics3/build_nanotron_from_hf.py b/tools/idefics3/build_nanotron_from_hf.py new file mode 100644 index 00000000..d71121fb --- /dev/null +++ b/tools/idefics3/build_nanotron_from_hf.py @@ -0,0 +1,488 @@ +""" +torchrun --nproc-per-node 1 tools/idefics3/build_nanotron_from_hf.py --nanotron-checkpoint-path nanotron-ckpt --pretrained-model-name-or-path-llama3 meta-llama/Meta-Llama-3-8B-Instruct --pretrained-model-name-or-path-siglip google/siglip-base-patch16-224 +""" +import sys +sys.path.append('.venv/lib/python3.10/site-packages') + +import argparse +from dataclasses import asdict +import json +from pathlib import Path +import torch +from tqdm import tqdm +import yaml +from nanotron import logging +from nanotron.config.config import Config, GeneralArgs, LoggingArgs, ModelArgs, TokenizerArgs +from nanotron.config.models_config import ExistingCheckpointInit, Idefics3VisionConfig, 
Idefics3Config +from nanotron.config.parallelism_config import ParallelismArgs +from nanotron.logging import log_rank, set_ranks_logging_level +from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron +from nanotron.models.base import build_model +from nanotron.models.idefics import Idefics3ForTraining, VisionTransformer +from nanotron.parallel.context import ParallelContext +from nanotron.parallel.parameters import sanity_check +from nanotron.serialize.weights import save_weights +from nanotron.trainer import mark_tied_parameters +# from tools.llama3.convert_hf_to_nanotron import copy_weights_from_hf_to_nanotron as copy_weights_from_hf_to_nanotron_llama +# from tools.llama3.convert_hf_to_nanotron import nanotron_config_from_hf_config as nanotron_config_from_hf_config_llama + +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel + + +def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_llama_config): + # Copy params from HF to Nanotron + log_rank("Copying weights from HF model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) + # Token embeddings + log_rank("Copying Token Embeddings...", logger=logger, level=logging.INFO, rank=0) + assert ( + nanotron_model.token_position_embeddings.pp_block.token_embedding.weight.shape + == hf_model.model.embed_tokens.weight.shape + ) + with torch.no_grad(): + nanotron_model.token_position_embeddings.pp_block.token_embedding.weight.copy_( + hf_model.model.embed_tokens.weight + ) + + # Decoder layers + for i in tqdm( + range(nanotron_llama_config.num_hidden_layers), + desc="Copying Hidden Layers", + total=nanotron_llama_config.num_hidden_layers, + ): + # Input layer norm + assert ( + hf_model.model.layers[i].input_layernorm.weight.shape + == nanotron_model.decoder[i].pp_block.input_layernorm.weight.shape + ) + with torch.no_grad(): + nanotron_model.decoder[i].pp_block.input_layernorm.weight.copy_( + hf_model.model.layers[i].input_layernorm.weight + ) + + # Self attn + ## QKV + tmp_qkv_proj = torch.cat( + [ + hf_model.model.layers[i].self_attn.q_proj.weight, + hf_model.model.layers[i].self_attn.k_proj.weight, + hf_model.model.layers[i].self_attn.v_proj.weight, + ], + dim=0, + ) + assert tmp_qkv_proj.shape == nanotron_model.decoder[i].pp_block.attn.qkv_proj.weight.shape + with torch.no_grad(): + nanotron_model.decoder[i].pp_block.attn.qkv_proj.weight.copy_(tmp_qkv_proj) + + ## O + assert ( + hf_model.model.layers[i].self_attn.o_proj.weight.shape + == nanotron_model.decoder[i].pp_block.attn.o_proj.weight.shape + ) + with torch.no_grad(): + nanotron_model.decoder[i].pp_block.attn.o_proj.weight.copy_( + hf_model.model.layers[i].self_attn.o_proj.weight + ) + + # MLP + ## Gate Up Proj + tmp_gate_up_proj = torch.cat( + [ + hf_model.model.layers[i].mlp.gate_proj.weight, + hf_model.model.layers[i].mlp.up_proj.weight, + ], + dim=0, + ) + + assert tmp_gate_up_proj.shape == nanotron_model.decoder[i].pp_block.mlp.gate_up_proj.weight.shape + with torch.no_grad(): + nanotron_model.decoder[i].pp_block.mlp.gate_up_proj.weight.copy_(tmp_gate_up_proj) + + ## Down Proj + assert ( + hf_model.model.layers[i].mlp.down_proj.weight.shape + == nanotron_model.decoder[i].pp_block.mlp.down_proj.weight.shape + ) + with torch.no_grad(): + nanotron_model.decoder[i].pp_block.mlp.down_proj.weight.copy_( + hf_model.model.layers[i].mlp.down_proj.weight + ) + + # Post attn layer norm + assert ( + hf_model.model.layers[i].post_attention_layernorm.weight.shape + == 
nanotron_model.decoder[i].pp_block.post_attention_layernorm.weight.shape + ) + with torch.no_grad(): + nanotron_model.decoder[i].pp_block.post_attention_layernorm.weight.copy_( + hf_model.model.layers[i].post_attention_layernorm.weight + ) + + # Last layer norm + log_rank("Copying Final Layer Norm...", logger=logger, level=logging.INFO, rank=0) + assert nanotron_model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape + with torch.no_grad(): + nanotron_model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight) + + # LM_Head + log_rank("Copying LM Head...", logger=logger, level=logging.INFO, rank=0) + assert nanotron_model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape + with torch.no_grad(): + nanotron_model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) + +def nanotron_config_from_hf_config_llama(hf_config): + return LlamaConfigNanotron( + bos_token_id=hf_config.bos_token_id, + eos_token_id=hf_config.eos_token_id, + hidden_act=hf_config.hidden_act, + hidden_size=hf_config.hidden_size, + initializer_range=hf_config.initializer_range, + intermediate_size=hf_config.intermediate_size, + is_llama_config=True, + max_position_embeddings=hf_config.max_position_embeddings, + num_attention_heads=hf_config.num_attention_heads, + num_hidden_layers=hf_config.num_hidden_layers, + num_key_value_heads=hf_config.num_key_value_heads, + pad_token_id=None, + pretraining_tp=hf_config.pretraining_tp, + rms_norm_eps=hf_config.rms_norm_eps, + rope_scaling=hf_config.rope_scaling, + rope_theta=hf_config.rope_theta, + rope_interleaved=False, + tie_word_embeddings=hf_config.tie_word_embeddings, + use_cache=hf_config.use_cache, + vocab_size=hf_config.vocab_size, + ) + + + +logger = logging.get_logger(__name__) + +DEVICE = torch.device("cpu") +TORCH_DTYPE = torch.bfloat16 + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="Nanotron Model") + group.add_argument( + "--nanotron-checkpoint-path", + type=str, + required=True, + help="A path to a directory to store the converted Nanotron Checkpoint", + ) + + group = parser.add_argument_group(title="HuggingFace LLama3 Model") + group.add_argument( + "--pretrained-model-name-or-path-llama3", + type=str, + required=True, + help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", + ) + + group = parser.add_argument_group(title="HuggingFace SigLIP Model") + group.add_argument( + "--pretrained-model-name-or-path-siglip", + type=str, + required=True, + help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", + ) + + + args = parser.parse_args() + + return args + +def copy_weights_from_hf_to_nanotron_siglip( + nanotron_model: VisionTransformer, + hf_model: AutoModel, + nanotron_vision_config: Idefics3VisionConfig +): + log_rank("Copying weights from HF SigLIP model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) + + # Vision Embeddings + log_rank("Copying Vision Embeddings...", logger=logger, level=logging.INFO, rank=0) + + assert ( + nanotron_model.embeddings.pp_block.patch_embedding.weight.shape == hf_model.embeddings.patch_embedding.weight.shape + ) + + assert( + nanotron_model.embeddings.pp_block.patch_embedding.bias.shape == hf_model.embeddings.patch_embedding.bias.shape + ) + + assert ( + 
nanotron_model.embeddings.pp_block.position_embedding.weight.shape + == hf_model.embeddings.position_embedding.weight.shape + ) + + with torch.no_grad(): + nanotron_model.embeddings.pp_block.patch_embedding.weight.copy_(hf_model.embeddings.patch_embedding.weight) + + nanotron_model.embeddings.pp_block.patch_embedding.bias.copy_(hf_model.embeddings.patch_embedding.bias) + + nanotron_model.embeddings.pp_block.position_embedding.weight.copy_(hf_model.embeddings.position_embedding.weight) + + + log_rank("Copied Vision Embeddings", logger=logger, level=logging.INFO, rank=0) + + for i in tqdm( + range(nanotron_vision_config.num_hidden_layers), + desc="Copying Vision Layers", + total=nanotron_vision_config.num_hidden_layers, + ): + assert ( + nanotron_model.encoder[i].pp_block.layer_norm1.weight.shape == hf_model.encoder.layers[i].layer_norm1.weight.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.layer_norm1.weight.copy_(hf_model.encoder.layers[i].layer_norm1.weight) + + tmp_qkv_proj = torch.cat( + [ + hf_model.encoder.layers[i].self_attn.k_proj.weight, + hf_model.encoder.layers[i].self_attn.v_proj.weight, + hf_model.encoder.layers[i].self_attn.q_proj.weight, + ], + dim=0, + ) + + assert ( + tmp_qkv_proj.shape == nanotron_model.encoder[i].pp_block.self_attn.qkv_proj.weight.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.self_attn.qkv_proj.weight.copy_(tmp_qkv_proj) + + tmp_qkv_proj_bias = torch.cat( + [ + hf_model.encoder.layers[i].self_attn.k_proj.bias, + hf_model.encoder.layers[i].self_attn.v_proj.bias, + hf_model.encoder.layers[i].self_attn.q_proj.bias, + ], + dim=0, + ) + + assert ( + tmp_qkv_proj_bias.shape == nanotron_model.encoder[i].pp_block.self_attn.qkv_proj.bias.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.self_attn.qkv_proj.bias.copy_(tmp_qkv_proj_bias) + + ## O + + assert ( + nanotron_model.encoder[i].pp_block.self_attn.o_proj.weight.shape == hf_model.encoder.layers[i].self_attn.out_proj.weight.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.self_attn.o_proj.weight.copy_(hf_model.encoder.layers[i].self_attn.out_proj.weight) + + assert ( + nanotron_model.encoder[i].pp_block.self_attn.o_proj.bias.shape == hf_model.encoder.layers[i].self_attn.out_proj.bias.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.self_attn.o_proj.bias.copy_(hf_model.encoder.layers[i].self_attn.out_proj.bias) + + # Layer Norm 2 + + assert ( + nanotron_model.encoder[i].pp_block.layer_norm2.weight.shape == hf_model.encoder.layers[i].layer_norm2.weight.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.layer_norm2.weight.copy_(hf_model.encoder.layers[i].layer_norm2.weight) + + # MLP + ## FC1 + + assert ( + nanotron_model.encoder[i].pp_block.mlp.fc1.weight.shape == hf_model.encoder.layers[i].mlp.fc1.weight.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.mlp.fc1.weight.copy_(hf_model.encoder.layers[i].mlp.fc1.weight) + + assert ( + nanotron_model.encoder[i].pp_block.mlp.fc1.bias.shape == hf_model.encoder.layers[i].mlp.fc1.bias.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.mlp.fc1.bias.copy_(hf_model.encoder.layers[i].mlp.fc1.bias) + + ## FC2 + + assert ( + nanotron_model.encoder[i].pp_block.mlp.fc2.weight.shape == hf_model.encoder.layers[i].mlp.fc2.weight.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.mlp.fc2.weight.copy_(hf_model.encoder.layers[i].mlp.fc2.weight) + + assert ( + 
nanotron_model.encoder[i].pp_block.mlp.fc2.bias.shape == hf_model.encoder.layers[i].mlp.fc2.bias.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.mlp.fc2.bias.copy_(hf_model.encoder.layers[i].mlp.fc2.bias) + + log_rank("Copied Vision Layers", logger=logger, level=logging.INFO, rank=0) + + # Post layer norm + + assert ( + nanotron_model.post_layernorm.pp_block.weight.shape == hf_model.post_layernorm.weight.shape + ) + + with torch.no_grad(): + nanotron_model.post_layernorm.pp_block.weight.copy_(hf_model.post_layernorm.weight) + + assert ( + nanotron_model.post_layernorm.pp_block.bias.shape == hf_model.post_layernorm.bias.shape + ) + + with torch.no_grad(): + nanotron_model.post_layernorm.pp_block.bias.copy_(hf_model.post_layernorm.bias) + + log_rank("Copied Post Layer Norm", logger=logger, level=logging.INFO, rank=0) + + +def main(args): + # Init Nanotron Parallel Utilities + parallel_config = ParallelismArgs(dp=1, pp=1, tp=1) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + + set_ranks_logging_level(parallel_context=parallel_context, logging_config=LoggingArgs()) + + # Load Llama3-8B HF model + log_rank( + f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path_llama3}", + logger=logger, + level=logging.INFO, + rank=0, + ) + + hf_model_llama = AutoModelForCausalLM.from_pretrained( + args.pretrained_model_name_or_path_llama3, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" + ).to(DEVICE) + hf_config_llama = hf_model_llama.config + + # Set Nanotron LlamaConfig + nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config_llama) + + # Load SigLIP HF model + log_rank( + f"Loading pretrained SigLIP Model: {args.pretrained_model_name_or_path_siglip}", + logger=logger, + level=logging.INFO, + rank=0, + ) + + hf_model_siglip = AutoModel.from_pretrained( + args.pretrained_model_name_or_path_siglip, torch_dtype=TORCH_DTYPE, + attn_implementation="flash_attention_2", + ).to(DEVICE) + hf_config_siglip = hf_model_siglip.config.vision_config + + # Set Nanotron SigLIPConfig + nanotron_vision_config = Idefics3VisionConfig( + hidden_size=hf_config_siglip.hidden_size, + image_size=hf_config_siglip.image_size, + intermediate_size=hf_config_siglip.intermediate_size, + num_hidden_layers= hf_config_siglip.num_hidden_layers, + num_attention_heads=hf_config_siglip.num_attention_heads, + num_key_value_heads=hf_config_siglip.num_attention_heads, + num_channels=hf_config_siglip.num_channels, + patch_size=hf_config_siglip.patch_size, + hidden_act=hf_config_siglip.hidden_act, + layer_norm_eps=hf_config_siglip.layer_norm_eps, + attention_dropout=hf_config_siglip.attention_dropout, + is_using_mup=False + ) + + nanotron_idefics3_config = Idefics3Config( + llama_config=nanotron_llama_config, + vision_config=nanotron_vision_config, + ) + + # Init Idefics3 Nanotron model + log_rank("Init empty Nanotron Idefics3 Model", logger=logger, level=logging.INFO, rank=0) + nanotron_model = build_model( + model_builder=lambda: Idefics3ForTraining( + config=nanotron_idefics3_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + ), + parallel_context=parallel_context, + dtype=TORCH_DTYPE, + device=DEVICE, + ) + + mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) + sanity_check(root_module=nanotron_model) + + copy_weights_from_hf_to_nanotron_siglip( + 
nanotron_model=nanotron_model.model.vision_model, + hf_model=hf_model_siglip.vision_model, + nanotron_vision_config=nanotron_vision_config, + ) + + log_rank("Copied weights from HF SigLIP model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) + + + + # Copy weights from HF to Nanotron + copy_weights_from_hf_to_nanotron_llama( + nanotron_model=nanotron_model.model.llama, + hf_model=hf_model_llama, + nanotron_llama_config=nanotron_llama_config, + ) + + log_rank("Copied weights from HF Llama model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) + + + nanotron_checkpoint_path = Path( + args.nanotron_checkpoint_path + ) + + save_weights( + model=nanotron_model, + root_folder=nanotron_checkpoint_path, + parallel_context=parallel_context, + ) + + # Store Config and Model Config files + with open(nanotron_checkpoint_path / "config.yaml", "w") as f: + config = Config( + general=GeneralArgs(project="Nanotron", run="Idefics3"), + parallelism=parallel_config, + model=ModelArgs( + init_method=ExistingCheckpointInit(nanotron_checkpoint_path), + model_config=nanotron_idefics3_config, + ), + tokenizer=TokenizerArgs(nanotron_checkpoint_path), + ) + log_rank("Saving config ...", logger=logger, level=logging.INFO, rank=0) + yaml.dump(config.as_dict(), f) + + with open(nanotron_checkpoint_path / "model_config.json", "w") as f: + log_rank("Saving model config ...", logger=logger, level=logging.INFO, rank=0) + json.dump(asdict(nanotron_idefics3_config), f) + + log_rank( + f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}", + logger=logger, + level=logging.INFO, + rank=0, + ) + + +if __name__ == "__main__": + _args = get_args() + main(_args) \ No newline at end of file diff --git a/tools/idefics2/convert_hf_to_nanotron.py b/tools/idefics3/convert_hf_to_nanotron.py similarity index 85% rename from tools/idefics2/convert_hf_to_nanotron.py rename to tools/idefics3/convert_hf_to_nanotron.py index abe5a7b1..06ae1640 100644 --- a/tools/idefics2/convert_hf_to_nanotron.py +++ b/tools/idefics3/convert_hf_to_nanotron.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 tools/idefics2/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron-ckpt --pretrained-model-name-or-path-llama3 meta-llama/Meta-Llama-3-8B-Instruct --pretrained-model-name-or-path-siglip google/siglip-base-patch16-224 +torchrun --nproc-per-node 1 tools/idefics3/convert_hf_to_nanotron.py --nanotron-checkpoint-path nanotron-ckpt --pretrained-model-name-or-path HuggingFaceM4/Idefics3-8B-Llama3 """ import sys sys.path.append('.venv/lib/python3.10/site-packages') @@ -13,12 +13,12 @@ import yaml from nanotron import logging from nanotron.config.config import Config, GeneralArgs, LoggingArgs, ModelArgs, TokenizerArgs -from nanotron.config.models_config import ExistingCheckpointInit, Idefics2VisionConfig, Idefics2Config +from nanotron.config.models_config import ExistingCheckpointInit, Idefics3VisionConfig, Idefics3Config from nanotron.config.parallelism_config import ParallelismArgs from nanotron.logging import log_rank, set_ranks_logging_level from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron from nanotron.models.base import build_model -from nanotron.models.idefics import Idefics2ForTraining, VisionTransformer +from nanotron.models.idefics import Idefics3ForTraining, Idefics3Model, VisionTransformer from nanotron.parallel.context import ParallelContext from nanotron.parallel.parameters import sanity_check from nanotron.serialize.weights import save_weights @@ -192,12 +192,12 
@@ def get_args(): return args -def copy_weights_from_hf_to_nanotron_siglip( +def copy_weights_from_hf_to_nanotron_vision( nanotron_model: VisionTransformer, hf_model: AutoModel, - nanotron_vision_config: Idefics2VisionConfig + nanotron_vision_config: Idefics3VisionConfig ): - log_rank("Copying weights from HF SigLIP model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) + log_rank("Copying weights from Idefic3 ViT model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) # Vision Embeddings log_rank("Copying Vision Embeddings...", logger=logger, level=logging.INFO, rank=0) @@ -348,6 +348,23 @@ def copy_weights_from_hf_to_nanotron_siglip( log_rank("Copied Post Layer Norm", logger=logger, level=logging.INFO, rank=0) +def copy_weights_from_hf_to_nanotron_connector( + nanotron_model: Idefics3Model, + hf_model: AutoModel, + nanotron_config: Idefics3Config +): + log_rank("Copying weights from Idefic3 Connector to Nanotron model...", logger=logger, level=logging.INFO, rank=0) + + assert ( + nanotron_model.connector.pp_block.modality_projector.proj.weight.shape == hf_model.connector.weight.shape + ) + + with torch.no_grad(): + nanotron_model.connector.pp_block.modality_projector.proj.weight.copy_(hf_model.connector.weight) + + log_rank("Copied Connector", logger=logger, level=logging.INFO, rank=0) + + def main(args): # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs(dp=1, pp=1, tp=1) @@ -362,60 +379,47 @@ def main(args): # Load Llama3-8B HF model log_rank( - f"Loading pretrained Llama3 Model: {args.pretrained_model_name_or_path_llama3}", + f"Loading pretrained Idefics3 model: {args.pretrained_model_name_or_path}", logger=logger, level=logging.INFO, rank=0, ) - hf_model_llama = AutoModelForCausalLM.from_pretrained( - args.pretrained_model_name_or_path_llama3, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" + hf_model = AutoModelForCausalLM.from_pretrained( + args.pretrained_model_name_or_path, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" ).to(DEVICE) - hf_config_llama = hf_model_llama.config + hf_config = hf_model.config + hf_config_vision = hf_config.vision_config # Set Nanotron LlamaConfig - nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config_llama) - - # Load SigLIP HF model - log_rank( - f"Loading pretrained SigLIP Model: {args.pretrained_model_name_or_path_siglip}", - logger=logger, - level=logging.INFO, - rank=0, - ) - - hf_model_siglip = AutoModel.from_pretrained( - args.pretrained_model_name_or_path_siglip, torch_dtype=TORCH_DTYPE, - attn_implementation="flash_attention_2", - ).to(DEVICE) - hf_config_siglip = hf_model_siglip.config.vision_config + nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config.text_config) # Set Nanotron SigLIPConfig - nanotron_vision_config = Idefics2VisionConfig( - hidden_size=hf_config_siglip.hidden_size, - image_size=hf_config_siglip.image_size, - intermediate_size=hf_config_siglip.intermediate_size, - num_hidden_layers= hf_config_siglip.num_hidden_layers, - num_attention_heads=hf_config_siglip.num_attention_heads, - num_key_value_heads=hf_config_siglip.num_attention_heads, - num_channels=hf_config_siglip.num_channels, - patch_size=hf_config_siglip.patch_size, - hidden_act=hf_config_siglip.hidden_act, - layer_norm_eps=hf_config_siglip.layer_norm_eps, - attention_dropout=hf_config_siglip.attention_dropout, + nanotron_vision_config = Idefics3VisionConfig( + hidden_size=hf_config_vision.hidden_size, + image_size=hf_config_vision.image_size, + 
intermediate_size=hf_config_vision.intermediate_size, + num_hidden_layers= hf_config_vision.num_hidden_layers, + num_attention_heads=hf_config_vision.num_attention_heads, + num_key_value_heads=hf_config_vision.num_attention_heads, + num_channels=hf_config_vision.num_channels, + patch_size=hf_config_vision.patch_size, + hidden_act=hf_config_vision.hidden_act, + layer_norm_eps=hf_config_vision.layer_norm_eps, + attention_dropout=hf_config_vision.attention_dropout, is_using_mup=False ) - nanotron_idefics2_config = Idefics2Config( + nanotron_idefics3_config = Idefics3Config( llama_config=nanotron_llama_config, vision_config=nanotron_vision_config, ) - # Init Idefics2 Nanotron model - log_rank("Init empty Nanotron Idefics2 Model", logger=logger, level=logging.INFO, rank=0) + # Init Idefics3 Nanotron model + log_rank("Init empty Nanotron Idefics3 Model", logger=logger, level=logging.INFO, rank=0) nanotron_model = build_model( - model_builder=lambda: Idefics2ForTraining( - config=nanotron_idefics2_config, + model_builder=lambda: Idefics3ForTraining( + config=nanotron_idefics3_config, parallel_context=parallel_context, parallel_config=parallel_config, ), @@ -427,9 +431,9 @@ def main(args): mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) sanity_check(root_module=nanotron_model) - copy_weights_from_hf_to_nanotron_siglip( + copy_weights_from_hf_to_nanotron_vision( nanotron_model=nanotron_model.model.vision_model, - hf_model=hf_model_siglip.vision_model, + hf_model=hf_model.vision_model, nanotron_vision_config=nanotron_vision_config, ) @@ -440,13 +444,19 @@ def main(args): # Copy weights from HF to Nanotron copy_weights_from_hf_to_nanotron_llama( nanotron_model=nanotron_model.model.llama, - hf_model=hf_model_llama, + hf_model=hf_model.text_model, nanotron_llama_config=nanotron_llama_config, ) log_rank("Copied weights from HF Llama model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) + copy_weights_from_hf_to_nanotron_connector( + nanotron_model=nanotron_model.model, + hf_model=hf_model, + nanotron_config=nanotron_idefics3_config, + ) + nanotron_checkpoint_path = Path( args.nanotron_checkpoint_path ) @@ -460,11 +470,11 @@ def main(args): # Store Config and Model Config files with open(nanotron_checkpoint_path / "config.yaml", "w") as f: config = Config( - general=GeneralArgs(project="Nanotron", run="Idefics2"), + general=GeneralArgs(project="Nanotron", run="Idefics3"), parallelism=parallel_config, model=ModelArgs( init_method=ExistingCheckpointInit(nanotron_checkpoint_path), - model_config=nanotron_idefics2_config, + model_config=nanotron_idefics3_config, ), tokenizer=TokenizerArgs(nanotron_checkpoint_path), ) @@ -473,7 +483,7 @@ def main(args): with open(nanotron_checkpoint_path / "model_config.json", "w") as f: log_rank("Saving model config ...", logger=logger, level=logging.INFO, rank=0) - json.dump(asdict(nanotron_idefics2_config), f) + json.dump(asdict(nanotron_idefics3_config), f) log_rank( f"Checkpoint conversion finished, check {args.nanotron_checkpoint_path}", diff --git a/tools/idefics3/generate_hf_predictions.py b/tools/idefics3/generate_hf_predictions.py new file mode 100644 index 00000000..b7198038 --- /dev/null +++ b/tools/idefics3/generate_hf_predictions.py @@ -0,0 +1,209 @@ +""" +torchrun --nproc-per-node 1 tools/idefics3/generate_hf_predictions.py --pretrained-model-name-or-path HuggingFaceM4/Idefics3-8B-Llama3 +""" + +import argparse +import os +from typing import List, Optional +from PIL import Image + +import numpy as np +import 
requests +import torch + + +from transformers import AutoProcessor, AutoModelForVision2Seq +from transformers.image_utils import load_image + +DEVICE = torch.device("cuda") +TORCH_DTYPE = torch.bfloat16 + +messages = [{ + "role": "user", + "content": [ + {"type": "text", "text": "What’s the difference between these two images?"}, + {"type": "image"}, + {"type": "image"}, + ], +}, +{ + "role": "assistant", + "content": [ + {"type": "text", "text": "The difference is that one image is about dogs and the other one about cats."}, + ], +}] + + +url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" +url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="HuggingFace Model") + group.add_argument( + "--pretrained-model-name-or-path", + type=str, + required=True, + help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", + ) + + args = parser.parse_args() + + return args + +def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_hidden_states: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_seen_tokens = 0 + if use_cache: + if past_key_values is None: + past_key_values = DynamicCache() + past_seen_tokens = past_key_values.get_seq_length() + + if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0: + raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.") + + if inputs_embeds is None: + inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(self.device) + + # START VISUAL INPUTS INTEGRATION + if pixel_values is not None and image_hidden_states is not None: + raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time") + elif pixel_values is not None: + batch_size, num_images, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility + pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. 
+ nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask + pixel_attention_mask = pixel_attention_mask.view( + batch_size * num_images, *pixel_attention_mask.shape[2:] + ) + pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size) + patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ).last_hidden_state + + print( + image_hidden_states.shape, # torch.Size([26, 676, 1152]) + ) + + # Modality projection & resampling + image_hidden_states = self.connector(image_hidden_states) + + elif image_hidden_states is not None: + image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device) + + if past_seen_tokens == 0 and inputs_embeds is not None and image_hidden_states is not None: + # When we generate, we don't want to replace the potential image_token_id that we generated by images + # that simply don't exist + + print( + inputs_embeds.shape, + ) + + inputs_embeds = self.inputs_merger( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + image_hidden_states=image_hidden_states, + ) + + outputs = self.text_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return tuple(v for v in [*outputs, image_hidden_states] if v is not None) + + return None + + +def main(args): + + model = AutoModelForVision2Seq.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=TORCH_DTYPE, + attn_implementation="flash_attention_2", + device_map="auto", + ).eval() + + image_1 = Image.open(requests.get(url_1, stream=True).raw) + image_2 = Image.open(requests.get(url_2, stream=True).raw) + images = [image_1, image_2] + + processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") + + text = processor.apply_chat_template(messages, add_generation_prompt=True) + inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE) + + + + with torch.no_grad(): + # output = model(**inputs) + + forward( + model.model, + use_cache=False, + **inputs + ) + + + +if __name__ == "__main__": + _args = get_args() + main(_args) diff --git a/tools/idefics2/generate_nanotron_predictions.py b/tools/idefics3/generate_nanotron_predictions.py similarity index 95% rename from tools/idefics2/generate_nanotron_predictions.py rename to tools/idefics3/generate_nanotron_predictions.py index 310906ac..d3383a49 100644 --- a/tools/idefics2/generate_nanotron_predictions.py +++ b/tools/idefics3/generate_nanotron_predictions.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 2 
tools/Idefics3/generate_nanotron_predictions.py --tp 2 --nanotron-checkpoint-path nanotron-ckpt +torchrun --nproc-per-node 2 tools/idefics3/generate_nanotron_predictions.py --tp 2 --nanotron-checkpoint-path nanotron-ckpt """ import argparse import os @@ -20,7 +20,7 @@ from nanotron.serialize import load_weights from nanotron.trainer import mark_tied_parameters # from sklearn.metrics import accuracy_score -from transformers import AutoTokenizer, Idefics3Processor +from transformers import AutoTokenizer, AutoProcessor from PIL import Image @@ -105,6 +105,9 @@ def main(args): device=DEVICE, # TODO Check with different parallelism if cpu is available ) + + #torch.Size([484, 26, 768]) + mark_tied_parameters(model=model, parallel_context=parallel_context) sanity_check(root_module=model) @@ -116,9 +119,9 @@ def main(args): image_2 = Image.open(requests.get(url_2, stream=True).raw) images = [image_1, image_2] - processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8b") + processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") - text = processor.apply_chat_template(messages, add_generation_prompt=False) + text = processor.apply_chat_template(messages, add_generation_prompt=True) inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE) # labels = inputs.input_ids.clone() From f6bcfe3fb13026c02b138e25c2582206bc08e5a2 Mon Sep 17 00:00:00 2001 From: kisate Date: Wed, 30 Oct 2024 21:04:43 +0000 Subject: [PATCH 02/45] Constructed idefics3 runs --- src/nanotron/models/idefics.py | 13 +-- tools/idefics3/build_nanotron_from_hf.py | 25 +++-- tools/idefics3/generate_hf_predictions.py | 98 +++++++++++++++++-- .../idefics3/generate_nanotron_predictions.py | 5 +- 4 files changed, 110 insertions(+), 31 deletions(-) diff --git a/src/nanotron/models/idefics.py b/src/nanotron/models/idefics.py index 447d70f8..75ddb026 100644 --- a/src/nanotron/models/idefics.py +++ b/src/nanotron/models/idefics.py @@ -87,8 +87,7 @@ def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.B position_ids = position_ids.to(self.position_embedding.weight.device) embeddings = embeddings + self.position_embedding(position_ids) - - embeddings = embeddings.transpose(0, 1) + # embeddings = embeddings.transpose(0, 1) return { "embeddings": embeddings, @@ -565,7 +564,7 @@ def __init__( hidden_size = config.vision_config.hidden_size self.input_size = hidden_size * (config.scale_factor ** 2) - self.output_size = hidden_size + self.output_size = config.llama_config.hidden_size self.proj = TensorParallelColumnLinear( self.input_size, @@ -711,10 +710,6 @@ def forward( patch_attention_mask=patch_attention_mask, )["hidden_states"] - print( - image_hidden_states.shape, - ) - # Modality projection & resampling image_hidden_states = self.connector( hidden_states=image_hidden_states @@ -724,13 +719,13 @@ def forward( inputs_embeds = self.inputs_merger( input_ids=input_ids, - inputs_embeds=inputs_embeds["input_embeds"], + inputs_embeds=inputs_embeds["input_embeds"].transpose(0, 1), image_hidden_states=image_hidden_states["hidden_states"], ) outputs = self.llama.forward_with_hidden_states( input_ids=None, - inputs_embeds=inputs_embeds, + input_embeds=inputs_embeds, input_mask=input_mask, ) diff --git a/tools/idefics3/build_nanotron_from_hf.py b/tools/idefics3/build_nanotron_from_hf.py index d71121fb..c8a48e7f 100644 --- a/tools/idefics3/build_nanotron_from_hf.py +++ b/tools/idefics3/build_nanotron_from_hf.py @@ -1,5 +1,5 @@ """ -torchrun --nproc-per-node 1 
tools/idefics3/build_nanotron_from_hf.py --nanotron-checkpoint-path nanotron-ckpt --pretrained-model-name-or-path-llama3 meta-llama/Meta-Llama-3-8B-Instruct --pretrained-model-name-or-path-siglip google/siglip-base-patch16-224 +torchrun --nproc-per-node 1 tools/idefics3/build_nanotron_from_hf.py --nanotron-checkpoint-path nanotron-ckpt --pretrained-model-name-or-path-llama3 meta-llama/Meta-Llama-3-8B-Instruct --pretrained-model-name-or-path-siglip google/siglip-so400m-patch14-384 """ import sys sys.path.append('.venv/lib/python3.10/site-packages') @@ -29,17 +29,19 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel -def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_llama_config): +def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_config, + additional_vocab_size): + nanotron_llama_config = nanotron_config.llama_config # Copy params from HF to Nanotron log_rank("Copying weights from HF model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) # Token embeddings log_rank("Copying Token Embeddings...", logger=logger, level=logging.INFO, rank=0) assert ( - nanotron_model.token_position_embeddings.pp_block.token_embedding.weight.shape + nanotron_model.token_position_embeddings.pp_block.token_embedding.weight[:-additional_vocab_size].shape == hf_model.model.embed_tokens.weight.shape ) with torch.no_grad(): - nanotron_model.token_position_embeddings.pp_block.token_embedding.weight.copy_( + nanotron_model.token_position_embeddings.pp_block.token_embedding.weight[:-additional_vocab_size].copy_( hf_model.model.embed_tokens.weight ) @@ -125,11 +127,11 @@ def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_ll # LM_Head log_rank("Copying LM Head...", logger=logger, level=logging.INFO, rank=0) - assert nanotron_model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape + assert nanotron_model.lm_head.pp_block.weight[:-additional_vocab_size].shape == hf_model.lm_head.weight.shape with torch.no_grad(): - nanotron_model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) + nanotron_model.lm_head.pp_block.weight[:-additional_vocab_size].copy_(hf_model.lm_head.weight) -def nanotron_config_from_hf_config_llama(hf_config): +def nanotron_config_from_hf_config_llama(hf_config, additional_vocab_size=3): return LlamaConfigNanotron( bos_token_id=hf_config.bos_token_id, eos_token_id=hf_config.eos_token_id, @@ -150,7 +152,7 @@ def nanotron_config_from_hf_config_llama(hf_config): rope_interleaved=False, tie_word_embeddings=hf_config.tie_word_embeddings, use_cache=hf_config.use_cache, - vocab_size=hf_config.vocab_size, + vocab_size=hf_config.vocab_size + additional_vocab_size, ) @@ -349,6 +351,7 @@ def copy_weights_from_hf_to_nanotron_siglip( def main(args): + additional_vocab_size = 4 # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs(dp=1, pp=1, tp=1) @@ -373,8 +376,9 @@ def main(args): ).to(DEVICE) hf_config_llama = hf_model_llama.config + # Set Nanotron LlamaConfig - nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config_llama) + nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config_llama, additional_vocab_size) # Load SigLIP HF model log_rank( @@ -441,7 +445,8 @@ def main(args): copy_weights_from_hf_to_nanotron_llama( nanotron_model=nanotron_model.model.llama, hf_model=hf_model_llama, - nanotron_llama_config=nanotron_llama_config, + nanotron_config=nanotron_idefics3_config, + additional_vocab_size=additional_vocab_size ) 
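(Illustrative aside, not part of the patch.) The additional_vocab_size handling in this hunk copies the HF embedding and LM-head rows into the first vocab_size slots of the enlarged Nanotron tables via the [:-additional_vocab_size] slices, leaving the trailing rows for the extra ids (e.g. the <image> token). A minimal sketch with toy sizes; the dimensions and the stand-in init are assumptions for illustration only:

import torch

# Toy stand-ins for hf_config.vocab_size, hidden_size and additional_vocab_size.
hf_vocab, hidden, extra = 8, 4, 2

hf_embed = torch.randn(hf_vocab, hidden)                # plays the role of hf_model.model.embed_tokens.weight
nanotron_embed = torch.empty(hf_vocab + extra, hidden)  # Nanotron table built with vocab_size + extra rows

with torch.no_grad():
    nanotron_embed[:-extra].copy_(hf_embed)    # same slicing as the [:-additional_vocab_size] copies in the patch
    nanotron_embed[-extra:].normal_(std=0.02)  # stand-in init for the new rows; in the script they keep build_model's init

assert torch.equal(nanotron_embed[:hf_vocab], hf_embed)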
log_rank("Copied weights from HF Llama model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) diff --git a/tools/idefics3/generate_hf_predictions.py b/tools/idefics3/generate_hf_predictions.py index b7198038..708567af 100644 --- a/tools/idefics3/generate_hf_predictions.py +++ b/tools/idefics3/generate_hf_predictions.py @@ -52,6 +52,91 @@ def get_args(): return args + +def forward_embedding( + self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor: + batch_size, _, max_im_h, max_im_w = pixel_values.shape + + patch_embeds = self.patch_embedding(pixel_values) + + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size + boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) + position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + +def forward_vision( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_size = self.patch_size + patch_attention_mask = torch.ones( + ( + batch_size, + pixel_values.size(2) // patch_size, + pixel_values.size(3) // patch_size, + ) + ) + patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device) + + hidden_states = forward_embedding(self.embeddings, pixel_values=pixel_values, patch_attention_mask=patch_attention_mask) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + # The call to `_upad_input` in `_flash_attention_forward` is expensive + # So when the `patch_attention_mask` is full of 1s (i.e. 
attending to the whole sequence), + # avoiding passing the attention_mask, which is equivalent to attending to the full sequence + if not torch.any(~patch_attention_mask): + patch_attention_mask = None + elif not self._use_flash_attention_2: + patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=patch_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state,) + encoder_outputs[1:] + + return last_hidden_state + + def forward( self, input_ids: torch.LongTensor = None, @@ -127,13 +212,10 @@ def forward( patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() # Get sequence from the vision encoder - image_hidden_states = self.vision_model( + image_hidden_states = forward_vision( + self.vision_model, pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, - ).last_hidden_state - - print( - image_hidden_states.shape, # torch.Size([26, 676, 1152]) ) # Modality projection & resampling @@ -146,10 +228,6 @@ def forward( # When we generate, we don't want to replace the potential image_token_id that we generated by images # that simply don't exist - print( - inputs_embeds.shape, - ) - inputs_embeds = self.inputs_merger( input_ids=input_ids, inputs_embeds=inputs_embeds, @@ -173,6 +251,7 @@ def forward( return None + def main(args): model = AutoModelForVision2Seq.from_pretrained( @@ -187,7 +266,6 @@ def main(args): images = [image_1, image_2] processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") - text = processor.apply_chat_template(messages, add_generation_prompt=True) inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE) diff --git a/tools/idefics3/generate_nanotron_predictions.py b/tools/idefics3/generate_nanotron_predictions.py index d3383a49..4bf01de4 100644 --- a/tools/idefics3/generate_nanotron_predictions.py +++ b/tools/idefics3/generate_nanotron_predictions.py @@ -119,7 +119,9 @@ def main(args): image_2 = Image.open(requests.get(url_2, stream=True).raw) images = [image_1, image_2] - processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") + target_image_seq_len = int(((364 // nanotron_config.model.model_config.vision_config.patch_size) ** 2) / (nanotron_config.model.model_config.scale_factor**2)) + + processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=target_image_seq_len) text = processor.apply_chat_template(messages, add_generation_prompt=True) inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE) @@ -130,7 +132,6 @@ def main(args): seq_length = inputs.input_ids.size(1) - print(inputs.keys()) inputs = { "input_ids": inputs['input_ids'], From 12a71c195de6ea2a00a16d2a8e5ac8e1cf987079 Mon Sep 17 00:00:00 2001 From: kisate Date: Wed, 30 Oct 2024 22:21:34 +0000 Subject: [PATCH 03/45] Convert from hf produces working weights --- src/nanotron/config/config.py | 4 ++ src/nanotron/config/models_config.py | 2 +- tmp.py | 21 ++++++ tools/idefics3/convert_hf_to_nanotron.py | 89 +++++++++++------------- 4 files changed, 66 insertions(+), 50 deletions(-) create mode 100644 tmp.py diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index adc1eafd..7e331858 100644 --- 
a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -423,6 +423,10 @@ def get_config_from_dict( for k, v in config_dict.items() if v is not None } + + print( + config_dict + ) return from_dict( data_class=config_class, data=config_dict, diff --git a/src/nanotron/config/models_config.py b/src/nanotron/config/models_config.py index 4222395d..298c00cd 100644 --- a/src/nanotron/config/models_config.py +++ b/src/nanotron/config/models_config.py @@ -33,7 +33,7 @@ class LlamaConfig: """ bos_token_id: int = 1 - eos_token_id: int = 2 + eos_token_id: Union[int, List[int]] = 2 hidden_act: str = "silu" hidden_size: int = 4096 initializer_range: float = 0.02 diff --git a/tmp.py b/tmp.py new file mode 100644 index 00000000..d74ae89c --- /dev/null +++ b/tmp.py @@ -0,0 +1,21 @@ +a = {'checkpoints': None, 'data_stages': None, 'general': {'benchmark_csv_path': None, 'consumed_train_samples': None, 'entity': None, 'ignore_sanity_checks': True, 'project': 'Nanotron', 'run': 'Idefics3', 'seed': 42, 'step': None}, 'lighteval': None, 'logging': None, 'model': {'ddp_bucket_cap_mb': 25, 'dtype': 'bfloat16', 'init_method': {'path': 'nanotron-ckpt'}, 'make_vocab_size_divisible_by': 1, 'model_config': {'image_token_id': 128257, 'llama_config': {'bos_token_id': 128000, 'eos_token_id': [128001, 128008, 128009], 'hidden_act': 'silu', 'hidden_size': 4096, 'initializer_range': 0.02, 'intermediate_size': 14336, 'is_llama_config': True, 'max_position_embeddings': 131072, 'num_attention_heads': 32, 'num_hidden_layers': 32, 'num_key_value_heads': 8, 'pad_token_id': None, 'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_interleaved': False, 'rope_scaling': {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'rope_theta': 500000.0, 'tie_word_embeddings': False, 'use_cache': True, 'vocab_size': 128260}, 'pad_token_id': None, 'scale_factor': 2, 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 364, 'intermediate_size': 4304, 'is_using_mup': False, 'layer_norm_eps': 1e-06, 'num_attention_heads': 16, 'num_channels':3, 'num_hidden_layers': 27, 'num_key_value_heads': 16, 'patch_size': 14}}}, 'optimizer': None, 'parallelism': {'dp': 1, 'expert_parallel_size': 1, 'pp': 1, 'pp_engine': 'afab', 'recompute_layer': False,'tp': 1, 'tp_linear_async_communication': False, 'tp_mode': 'ALL_REDUCE', 'tp_recompute_allgather': True}, 'profiler': None, 'tokenizer': {'tokenizer_max_length': None, 'tokenizer_name_or_path': 'nanotron-ckpt', 'tokenizer_revision': None}, 'tokens':None} + + +g = {'checkpoints': None, 'data_stages': None, 'general': {'benchmark_csv_path': None, 'consumed_train_samples': None, 'entity': None, 'ignore_sanity_checks': True, 'project': 'Nanotron', 'run': 'Idefics3', 'seed': 42, 'step': None},'lighteval': None, 'logging': None, 'model': {'ddp_bucket_cap_mb': 25, 'dtype': 'bfloat16', 'init_method': {'path': 'nanotron-ckpt'}, 'make_vocab_size_divisible_by': 1, 'model_config': {'image_token_id': 128257, 'llama_config': {'bos_token_id': 128000, 'eos_token_id': 128009, 'hidden_act': 'silu', 'hidden_size': 4096, 'initializer_range': 0.02, 'intermediate_size': 14336, 'is_llama_config': True, 'max_position_embeddings': 8192, 'num_attention_heads': 32, 'num_hidden_layers': 32, 'num_key_value_heads': 8, 'pad_token_id': None, 'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_interleaved': False, 'rope_scaling': None, 'rope_theta': 500000.0, 'tie_word_embeddings': False, 
'use_cache': True, 'vocab_size': 128260}, 'pad_token_id': 128002, 'scale_factor': 2, 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 384, 'intermediate_size': 4304, 'is_using_mup': False, 'layer_norm_eps': 1e-06, 'num_attention_heads': 16, 'num_channels': 3, 'num_hidden_layers': 27, 'num_key_value_heads': 16, 'patch_size': 14}}}, 'optimizer': None, 'parallelism': {'dp': 1, 'expert_parallel_size': 1, 'pp': 1, 'pp_engine': 'afab', 'recompute_layer': False, 'tp': 1, 'tp_linear_async_communication': False, 'tp_mode': 'ALL_REDUCE', 'tp_recompute_allgather': True}, 'profiler': None, 'tokenizer': {'tokenizer_max_length':None, 'tokenizer_name_or_path': 'nanotron-ckpt', 'tokenizer_revision': None}, 'tokens': None} + +# print(b) + +def compare_dicts(a, b): + all_keys = set(a.keys()) | set(b.keys()) + for key in all_keys: + if key in a and key in b: + if isinstance(a[key], dict) and isinstance(b[key], dict): + compare_dicts(a[key], b[key]) + elif a[key] != b[key]: + print(f"Key: {key}, Value in a: {a[key]}, Value in b: {b[key]}") + if key in a and key not in b: + print(f"Key: {key} not in b") + if key in b and key not in a: + print(f"Key: {key} not in a") + +compare_dicts(a, g) diff --git a/tools/idefics3/convert_hf_to_nanotron.py b/tools/idefics3/convert_hf_to_nanotron.py index 06ae1640..63bc5699 100644 --- a/tools/idefics3/convert_hf_to_nanotron.py +++ b/tools/idefics3/convert_hf_to_nanotron.py @@ -26,21 +26,23 @@ # from tools.llama3.convert_hf_to_nanotron import copy_weights_from_hf_to_nanotron as copy_weights_from_hf_to_nanotron_llama # from tools.llama3.convert_hf_to_nanotron import nanotron_config_from_hf_config as nanotron_config_from_hf_config_llama -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel +from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoModel -def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_llama_config): +def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_config, + additional_vocab_size): + nanotron_llama_config = nanotron_config.llama_config # Copy params from HF to Nanotron log_rank("Copying weights from HF model to Nanotron model...", logger=logger, level=logging.INFO, rank=0) # Token embeddings log_rank("Copying Token Embeddings...", logger=logger, level=logging.INFO, rank=0) assert ( - nanotron_model.token_position_embeddings.pp_block.token_embedding.weight.shape - == hf_model.model.embed_tokens.weight.shape + nanotron_model.token_position_embeddings.pp_block.token_embedding.weight[:-additional_vocab_size].shape + == hf_model.embed_tokens.weight.shape ) with torch.no_grad(): - nanotron_model.token_position_embeddings.pp_block.token_embedding.weight.copy_( - hf_model.model.embed_tokens.weight + nanotron_model.token_position_embeddings.pp_block.token_embedding.weight[:-additional_vocab_size].copy_( + hf_model.embed_tokens.weight ) # Decoder layers @@ -51,21 +53,21 @@ def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_ll ): # Input layer norm assert ( - hf_model.model.layers[i].input_layernorm.weight.shape + hf_model.layers[i].input_layernorm.weight.shape == nanotron_model.decoder[i].pp_block.input_layernorm.weight.shape ) with torch.no_grad(): nanotron_model.decoder[i].pp_block.input_layernorm.weight.copy_( - hf_model.model.layers[i].input_layernorm.weight + hf_model.layers[i].input_layernorm.weight ) # Self attn ## QKV tmp_qkv_proj = torch.cat( [ - 
hf_model.model.layers[i].self_attn.q_proj.weight, - hf_model.model.layers[i].self_attn.k_proj.weight, - hf_model.model.layers[i].self_attn.v_proj.weight, + hf_model.layers[i].self_attn.q_proj.weight, + hf_model.layers[i].self_attn.k_proj.weight, + hf_model.layers[i].self_attn.v_proj.weight, ], dim=0, ) @@ -75,20 +77,20 @@ def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_ll ## O assert ( - hf_model.model.layers[i].self_attn.o_proj.weight.shape + hf_model.layers[i].self_attn.o_proj.weight.shape == nanotron_model.decoder[i].pp_block.attn.o_proj.weight.shape ) with torch.no_grad(): nanotron_model.decoder[i].pp_block.attn.o_proj.weight.copy_( - hf_model.model.layers[i].self_attn.o_proj.weight + hf_model.layers[i].self_attn.o_proj.weight ) # MLP ## Gate Up Proj tmp_gate_up_proj = torch.cat( [ - hf_model.model.layers[i].mlp.gate_proj.weight, - hf_model.model.layers[i].mlp.up_proj.weight, + hf_model.layers[i].mlp.gate_proj.weight, + hf_model.layers[i].mlp.up_proj.weight, ], dim=0, ) @@ -99,37 +101,31 @@ def copy_weights_from_hf_to_nanotron_llama(nanotron_model, hf_model, nanotron_ll ## Down Proj assert ( - hf_model.model.layers[i].mlp.down_proj.weight.shape + hf_model.layers[i].mlp.down_proj.weight.shape == nanotron_model.decoder[i].pp_block.mlp.down_proj.weight.shape ) with torch.no_grad(): nanotron_model.decoder[i].pp_block.mlp.down_proj.weight.copy_( - hf_model.model.layers[i].mlp.down_proj.weight + hf_model.layers[i].mlp.down_proj.weight ) # Post attn layer norm assert ( - hf_model.model.layers[i].post_attention_layernorm.weight.shape + hf_model.layers[i].post_attention_layernorm.weight.shape == nanotron_model.decoder[i].pp_block.post_attention_layernorm.weight.shape ) with torch.no_grad(): nanotron_model.decoder[i].pp_block.post_attention_layernorm.weight.copy_( - hf_model.model.layers[i].post_attention_layernorm.weight + hf_model.layers[i].post_attention_layernorm.weight ) # Last layer norm log_rank("Copying Final Layer Norm...", logger=logger, level=logging.INFO, rank=0) - assert nanotron_model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape + assert nanotron_model.final_layer_norm.pp_block.weight.shape == hf_model.norm.weight.shape with torch.no_grad(): - nanotron_model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight) + nanotron_model.final_layer_norm.pp_block.weight.copy_(hf_model.norm.weight) - # LM_Head - log_rank("Copying LM Head...", logger=logger, level=logging.INFO, rank=0) - assert nanotron_model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape - with torch.no_grad(): - nanotron_model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) - -def nanotron_config_from_hf_config_llama(hf_config): +def nanotron_config_from_hf_config_llama(hf_config, additional_vocab_size=3): return LlamaConfigNanotron( bos_token_id=hf_config.bos_token_id, eos_token_id=hf_config.eos_token_id, @@ -150,7 +146,7 @@ def nanotron_config_from_hf_config_llama(hf_config): rope_interleaved=False, tie_word_embeddings=hf_config.tie_word_embeddings, use_cache=hf_config.use_cache, - vocab_size=hf_config.vocab_size, + vocab_size=hf_config.vocab_size + additional_vocab_size, ) @@ -171,23 +167,14 @@ def get_args(): help="A path to a directory to store the converted Nanotron Checkpoint", ) - group = parser.add_argument_group(title="HuggingFace LLama3 Model") + group = parser.add_argument_group(title="HuggingFace Idefic3 Model") group.add_argument( - "--pretrained-model-name-or-path-llama3", + "--pretrained-model-name-or-path", 
type=str, required=True, help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", ) - group = parser.add_argument_group(title="HuggingFace SigLIP Model") - group.add_argument( - "--pretrained-model-name-or-path-siglip", - type=str, - required=True, - help="A path to a directory containing model weights saved using save_pretrained() or the model id of a pretrained model hosted inside a model repo on the Hugging Face Hub", - ) - - args = parser.parse_args() return args @@ -356,16 +343,17 @@ def copy_weights_from_hf_to_nanotron_connector( log_rank("Copying weights from Idefic3 Connector to Nanotron model...", logger=logger, level=logging.INFO, rank=0) assert ( - nanotron_model.connector.pp_block.modality_projector.proj.weight.shape == hf_model.connector.weight.shape + nanotron_model.connector.pp_block.modality_projector.proj.weight.shape == hf_model.connector.modality_projection.proj.weight.shape ) with torch.no_grad(): - nanotron_model.connector.pp_block.modality_projector.proj.weight.copy_(hf_model.connector.weight) + nanotron_model.connector.pp_block.modality_projector.proj.weight.copy_(hf_model.connector.modality_projection.proj.weight) log_rank("Copied Connector", logger=logger, level=logging.INFO, rank=0) def main(args): + additional_vocab_size = 1 # Init Nanotron Parallel Utilities parallel_config = ParallelismArgs(dp=1, pp=1, tp=1) @@ -385,14 +373,14 @@ def main(args): rank=0, ) - hf_model = AutoModelForCausalLM.from_pretrained( + hf_model = AutoModelForVision2Seq.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=TORCH_DTYPE, attn_implementation="flash_attention_2" ).to(DEVICE) hf_config = hf_model.config hf_config_vision = hf_config.vision_config # Set Nanotron LlamaConfig - nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config.text_config) + nanotron_llama_config = nanotron_config_from_hf_config_llama(hf_config.text_config, additional_vocab_size) # Set Nanotron SigLIPConfig nanotron_vision_config = Idefics3VisionConfig( @@ -413,8 +401,10 @@ def main(args): nanotron_idefics3_config = Idefics3Config( llama_config=nanotron_llama_config, vision_config=nanotron_vision_config, + image_token_id=hf_config.image_token_id, + pad_token_id=hf_config.vision_config.pad_token_id, + scale_factor=hf_config.scale_factor, ) - # Init Idefics3 Nanotron model log_rank("Init empty Nanotron Idefics3 Model", logger=logger, level=logging.INFO, rank=0) nanotron_model = build_model( @@ -433,7 +423,7 @@ def main(args): copy_weights_from_hf_to_nanotron_vision( nanotron_model=nanotron_model.model.vision_model, - hf_model=hf_model.vision_model, + hf_model=hf_model.model.vision_model, nanotron_vision_config=nanotron_vision_config, ) @@ -444,8 +434,9 @@ def main(args): # Copy weights from HF to Nanotron copy_weights_from_hf_to_nanotron_llama( nanotron_model=nanotron_model.model.llama, - hf_model=hf_model.text_model, - nanotron_llama_config=nanotron_llama_config, + hf_model=hf_model.model.text_model, + nanotron_config=nanotron_idefics3_config, + additional_vocab_size=additional_vocab_size, ) log_rank("Copied weights from HF Llama model to Nanotron model!", logger=logger, level=logging.INFO, rank=0) @@ -453,7 +444,7 @@ def main(args): copy_weights_from_hf_to_nanotron_connector( nanotron_model=nanotron_model.model, - hf_model=hf_model, + hf_model=hf_model.model, nanotron_config=nanotron_idefics3_config, ) From f3986c8aaef3083e21c7742d8326382fe6fbb87c Mon Sep 17 00:00:00 
2001 From: kisate Date: Mon, 11 Nov 2024 01:50:59 +0000 Subject: [PATCH 04/45] Fixed attention calculation differences --- compare_outputs.py | 39 +++ src/nanotron/config/config.py | 3 - src/nanotron/models/idefics.py | 152 +++++++---- tools/idefics3/convert_hf_to_nanotron.py | 22 +- tools/idefics3/generate_hf_predictions.py | 257 +++++++++++++++++- .../idefics3/generate_nanotron_predictions.py | 5 + 6 files changed, 420 insertions(+), 58 deletions(-) create mode 100644 compare_outputs.py diff --git a/compare_outputs.py b/compare_outputs.py new file mode 100644 index 00000000..69a64bdb --- /dev/null +++ b/compare_outputs.py @@ -0,0 +1,39 @@ +import torch +hf = torch.load( + f"img_emb_hf_{26}_after_mlp.pt", +).detach().cpu() +nano = torch.load( + f"img_emb_nano_{26}_after_mlp.pt", +).detach().cpu() + + +def compare_outputs(hf, nano): + print( + hf.shape, nano.shape + ) + + print( + f"Max difference between outputs: {torch.max(torch.abs(hf - nano))}" + ) + print( + f"Mean difference between outputs: {torch.mean(torch.abs(hf - nano))}" + ) + print( + f"Relative difference between outputs: {torch.mean(torch.abs(hf - nano) / (torch.abs(hf) + 1e-6))}" + ) + + # print( + # hf[0, 0, :15], nano[0, 0, :15] + # ) + +compare_outputs(hf, nano) + + + +# a = torch.load(f"img_emb_hf_{0}_attention_mask.pt") +# b = torch.load(f"img_emb_hf_{0}_args.pt") +# c = torch.load(f"img_emb_nano_{0}_qkv.pt") + +# print(a) +# print(b) +# print(c.shape) \ No newline at end of file diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index 7e331858..75aa0533 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -424,9 +424,6 @@ def get_config_from_dict( if v is not None } - print( - config_dict - ) return from_dict( data_class=config_class, data=config_dict, diff --git a/src/nanotron/models/idefics.py b/src/nanotron/models/idefics.py index 75ddb026..f8f4e582 100644 --- a/src/nanotron/models/idefics.py +++ b/src/nanotron/models/idefics.py @@ -24,7 +24,7 @@ from nanotron.scaling.parametrization import SpectralMupParametrizator, StandardParametrizator from nanotron.utils import checkpoint_method from nanotron.models.llama import GLUActivation, LlamaModel, pad_to_right, Loss - +from transformers.modeling_flash_attention_utils import _flash_attention_forward logger = logging.get_logger(__name__) @@ -139,9 +139,9 @@ def forward( class VisionSelfAttention(nn.Module, AttachableStore): def __init__(self, config: Idefics3VisionConfig, parallel_config: Optional[ParallelismArgs], - tp_pg: dist.ProcessGroup): + tp_pg: dist.ProcessGroup, layer_idx: int): super().__init__() - + self.layer_idx = layer_idx assert ( config.num_attention_heads % tp_pg.size() == 0 ), f"Number of attention heads ({config.num_attention_heads}) must be divisible by TP size ({tp_pg.size()})." 
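# A standalone sketch, not part of the patch: compare_outputs.py above checks one
# hard-coded dump at a time. Assuming the img_emb_{hf,nano}_{layer}_{stage}.pt naming
# used by the torch.save debugging calls introduced in this commit, and the 27 vision
# layers from the config shown earlier, the per-layer dumps can be swept in one pass.
import torch

def sweep_layer_dumps(num_layers=27, stages=("after_ln1", "after_attn", "after_mlp")):
    for layer in range(num_layers):
        for stage in stages:
            try:
                hf = torch.load(f"img_emb_hf_{layer}_{stage}.pt", map_location="cpu").detach().float()
                nano = torch.load(f"img_emb_nano_{layer}_{stage}.pt", map_location="cpu").detach().float()
            except FileNotFoundError:
                continue  # this layer/stage was not dumped by both runs
            if hf.shape != nano.shape:
                print(f"layer {layer:02d} {stage}: shape mismatch {hf.shape} vs {nano.shape}")
                continue
            diff = (hf - nano).abs()
            print(f"layer {layer:02d} {stage:>10}: max {diff.max().item():.3e}  mean {diff.mean().item():.3e}")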
@@ -219,10 +219,15 @@ def forward( flash_attn_with_kvcache, ) + hidden_states = hidden_states + qkv_states = self.qkv_proj( hidden_states - ) # [seq_length, batch_size, n_local_q_heads * d_qk + 2 * n_local_kv_heads * d_qk] - q_length, batch_size, _ = qkv_states.shape + ) # [batch_size, seq_length, n_local_q_heads * d_qk + 2 * n_local_kv_heads * d_qk] + batch_size, q_length, _ = qkv_states.shape + + + torch.save(qkv_states, f"img_emb_nano_{self.layer_idx}_qkv.pt") if self.is_gqa: query_states, key_states, value_states = torch.split( @@ -236,18 +241,18 @@ def forward( ) query_states = ( - query_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_q_heads, self.d_qk) + query_states.contiguous().view(batch_size, q_length, self.n_local_q_heads, self.d_qk) ) key_states = ( - key_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) + key_states.contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) ) value_states = ( - value_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) + value_states.contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) ) else: query_states, key_states, value_states = ( - qkv_states.view(q_length, batch_size, 3, self.n_local_q_heads, self.d_qk) - .permute(2, 1, 0, 3, 4) + qkv_states.view(batch_size, q_length, 3, self.n_local_q_heads, self.d_qk) + .permute(2, 0, 1, 3, 4) .contiguous() ) # [3, batch_size, seq_length, n_local_q_heads, d_qk] @@ -267,16 +272,41 @@ def forward( key_states = key_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_qk) value_states = value_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_v) - attention_output = self.attention( - query_states=query_states, - key_states=key_states, - value_states=value_states, - ) + torch.save(query_states, f"img_emb_nano_{self.layer_idx}_query_states.pt") + torch.save(key_states, f"img_emb_nano_{self.layer_idx}_key_states.pt") + torch.save(value_states, f"img_emb_nano_{self.layer_idx}_value_states.pt") + + # attention_output = self.attention( + # query_states=query_states, + # key_states=key_states, + # value_states=value_states, + # ) - attention_output = ( - attention_output.contiguous().view(batch_size, q_length, self.n_local_q_heads * self.d_v).transpose(0, 1) + # attention_output = ( + # attention_output.contiguous().view(batch_size, q_length, self.n_local_q_heads * self.d_v).transpose(0, 1) + # ) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + None, + q_length, + dropout=0.0, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=False, + is_causal=False, + # **kwargs, ) - output = self.o_proj(attention_output) + + + + attn_output = attn_output.contiguous().view(batch_size, q_length, self.d_model) + + + torch.save(attn_output, f"img_emb_nano_{self.layer_idx}_attn_output.pt") + + output = self.o_proj(attn_output) return {"hidden_states": output, "sequence_mask": sequence_mask} @@ -331,7 +361,8 @@ def __init__( self, config: Idefics3VisionConfig, parallel_config: Optional[ParallelismArgs], - tp_pg: dist.ProcessGroup + tp_pg: dist.ProcessGroup, + layer_id: int, ): super().__init__() @@ -339,6 +370,7 @@ def __init__( config, parallel_config=parallel_config, tp_pg=tp_pg, + layer_idx=layer_id, ) self.layer_norm1 = TritonLayerNorm( @@ -359,6 +391,7 @@ def __init__( ) self.tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE 
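# A standalone sanity check, not part of the patch and independent of nanotron,
# for the permute change a few lines above: viewing the fused QKV projection as
# [batch, seq, 3, heads, d_qk] with permute(2, 0, 1, 3, 4) gives the same
# [3, batch, seq, heads, d_qk] tensor as the old [seq, batch, ...] path once the
# input layout is transposed. The shapes below are arbitrary toy values.
import torch

batch, seq, heads, d_qk = 2, 5, 4, 8
qkv_bs = torch.randn(batch, seq, 3 * heads * d_qk)   # new layout: [batch, seq, ...]
qkv_sb = qkv_bs.transpose(0, 1).contiguous()         # old layout: [seq, batch, ...]

new = qkv_bs.view(batch, seq, 3, heads, d_qk).permute(2, 0, 1, 3, 4)
old = qkv_sb.view(seq, batch, 3, heads, d_qk).permute(2, 1, 0, 3, 4)
assert torch.equal(new, old)  # both are [3, batch, seq, heads, d_qk]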
+ self.layer_id = layer_id def forward( @@ -367,17 +400,33 @@ def forward( sequence_mask: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: residual = hidden_states + + torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_before_ln1.pt") + hidden_states = self.layer_norm1(hidden_states) + + torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_ln1.pt") + output = self.self_attn(hidden_states=hidden_states, sequence_mask=sequence_mask) hidden_states = output["hidden_states"] + torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_attn.pt") + hidden_states = hidden_states + residual + + torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_before_ln2.pt") + residual = hidden_states hidden_states = self.layer_norm2(hidden_states) + + torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_ln2.pt") + hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] + torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_mlp.pt") + hidden_states = hidden_states + residual return { @@ -425,6 +474,7 @@ def __init__( "config": config.vision_config, "parallel_config": parallel_config, "tp_pg": parallel_context.tp_pg, + "layer_id": i, }, module_input_keys={ "hidden_states", @@ -436,7 +486,7 @@ def __init__( } ) - for _ in range(config.vision_config.num_hidden_layers) + for i in range(config.vision_config.num_hidden_layers) ] ) @@ -476,6 +526,9 @@ def forward( patch_attention_mask=patch_attention_mask, )["embeddings"] + + torch.save(hidden_states, "img_emb_nano.pt") + patch_attention_mask = patch_attention_mask.view(batch_size, -1) hidden_encoder_states = { @@ -483,9 +536,11 @@ def forward( "sequence_mask": patch_attention_mask, } - for encoder_layer in self.encoder: + for i, encoder_layer in enumerate(self.encoder): hidden_encoder_states = encoder_layer(**hidden_encoder_states) + torch.save(hidden_encoder_states["hidden_states"], f"img_emb_nano_{i}.pt") + hidden_states = hidden_encoder_states["hidden_states"] hidden_states = self.post_layernorm(input=hidden_states) @@ -565,15 +620,10 @@ def __init__( self.input_size = hidden_size * (config.scale_factor ** 2) self.output_size = config.llama_config.hidden_size - - self.proj = TensorParallelColumnLinear( + self.proj = nn.Linear( self.input_size, self.output_size, - pg=tp_pg, - mode=tp_mode, - bias=False, - async_communication=tp_linear_async_communication, - tp_recompute_allgather=parallel_config.tp_recompute_allgather, + bias = False ) def forward(self, hidden_states): @@ -587,13 +637,24 @@ def __init__( config: Idefics3Config, parallel_config: Optional[ParallelismArgs], tp_pg: dist.ProcessGroup, + p2p ): super().__init__() self.scale_factor = config.scale_factor - self.modality_projector = Idefics3SimpleMLP( - config=config, - parallel_config=parallel_config, - tp_pg=tp_pg, + self.modality_projector = PipelineBlock( + p2p=p2p, + module_builder=Idefics3SimpleMLP, + module_kwargs={ + "config": config, + "parallel_config": parallel_config, + "tp_pg": tp_pg, + }, + module_input_keys={ + "hidden_states" + }, + module_output_keys={ + "hidden_states" + } ) def pixel_shuffle(self, x, scale_factor=2): @@ -610,7 +671,7 @@ def pixel_shuffle(self, x, scale_factor=2): def forward(self, hidden_states): hidden_states = self.pixel_shuffle(hidden_states, self.scale_factor) - hidden_states = self.modality_projector(hidden_states)["hidden_states"] + hidden_states = self.modality_projector(hidden_states=hidden_states)["hidden_states"] return {"hidden_states": hidden_states} class 
Idefics3Model(nn.Module): @@ -638,20 +699,8 @@ def __init__( parallel_config=parallel_config, ) - self.connector = PipelineBlock( - p2p=self.llama.p2p, - module_builder=Idefics3Connector, - module_kwargs={ - "config": config, - "parallel_config": parallel_config, - "tp_pg": parallel_context.tp_pg, - }, - module_input_keys={ - "hidden_states" - }, - module_output_keys={ - "hidden_states" - } + self.connector = Idefics3Connector( + config, parallel_config, parallel_context.tp_pg, self.llama.p2p ) self.image_seq_len = int( @@ -664,6 +713,9 @@ def inputs_merger( inputs_embeds: Union[torch.Tensor, TensorPointer], image_hidden_states: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + print( + inputs_embeds.shape, image_hidden_states.shape + ) num_images, _, vision_hidden_size = image_hidden_states.shape special_image_token_mask = input_ids == self.image_token_id new_inputs_embeds = inputs_embeds.clone() @@ -704,12 +756,16 @@ def forward( patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) == patch_size * patch_size).bool() + torch.save(pixel_values, "pv_nano.pt") + # Get sequence from the vision encoder image_hidden_states = self.vision_model( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, )["hidden_states"] + torch.save(image_hidden_states, "img_hid_nano.pt") + # Modality projection & resampling image_hidden_states = self.connector( hidden_states=image_hidden_states @@ -717,6 +773,8 @@ def forward( inputs_embeds = self.llama.token_position_embeddings(input_ids=input_ids, input_mask=input_mask) + torch.save(inputs_embeds["input_embeds"], "input_embeds_nano.pt") + inputs_embeds = self.inputs_merger( input_ids=input_ids, inputs_embeds=inputs_embeds["input_embeds"].transpose(0, 1), diff --git a/tools/idefics3/convert_hf_to_nanotron.py b/tools/idefics3/convert_hf_to_nanotron.py index 63bc5699..fc5fea49 100644 --- a/tools/idefics3/convert_hf_to_nanotron.py +++ b/tools/idefics3/convert_hf_to_nanotron.py @@ -224,11 +224,18 @@ def copy_weights_from_hf_to_nanotron_vision( with torch.no_grad(): nanotron_model.encoder[i].pp_block.layer_norm1.weight.copy_(hf_model.encoder.layers[i].layer_norm1.weight) + assert ( + nanotron_model.encoder[i].pp_block.layer_norm1.bias.shape == hf_model.encoder.layers[i].layer_norm1.bias.shape + ) + + with torch.no_grad(): + nanotron_model.encoder[i].pp_block.layer_norm1.bias.copy_(hf_model.encoder.layers[i].layer_norm1.bias) + tmp_qkv_proj = torch.cat( [ + hf_model.encoder.layers[i].self_attn.q_proj.weight, hf_model.encoder.layers[i].self_attn.k_proj.weight, hf_model.encoder.layers[i].self_attn.v_proj.weight, - hf_model.encoder.layers[i].self_attn.q_proj.weight, ], dim=0, ) @@ -242,9 +249,9 @@ def copy_weights_from_hf_to_nanotron_vision( tmp_qkv_proj_bias = torch.cat( [ + hf_model.encoder.layers[i].self_attn.q_proj.bias, hf_model.encoder.layers[i].self_attn.k_proj.bias, hf_model.encoder.layers[i].self_attn.v_proj.bias, - hf_model.encoder.layers[i].self_attn.q_proj.bias, ], dim=0, ) @@ -281,6 +288,13 @@ def copy_weights_from_hf_to_nanotron_vision( with torch.no_grad(): nanotron_model.encoder[i].pp_block.layer_norm2.weight.copy_(hf_model.encoder.layers[i].layer_norm2.weight) + assert ( + nanotron_model.encoder[i].pp_block.layer_norm2.bias.shape == hf_model.encoder.layers[i].layer_norm2.bias.shape + ) + + with torch.no_grad(): + 
nanotron_model.encoder[i].pp_block.layer_norm2.bias.copy_(hf_model.encoder.layers[i].layer_norm2.bias) + # MLP ## FC1 @@ -343,11 +357,11 @@ def copy_weights_from_hf_to_nanotron_connector( log_rank("Copying weights from Idefic3 Connector to Nanotron model...", logger=logger, level=logging.INFO, rank=0) assert ( - nanotron_model.connector.pp_block.modality_projector.proj.weight.shape == hf_model.connector.modality_projection.proj.weight.shape + nanotron_model.connector.modality_projector.pp_block.proj.weight.shape == hf_model.connector.modality_projection.proj.weight.shape ) with torch.no_grad(): - nanotron_model.connector.pp_block.modality_projector.proj.weight.copy_(hf_model.connector.modality_projection.proj.weight) + nanotron_model.connector.modality_projector.pp_block.proj.weight.copy_(hf_model.connector.modality_projection.proj.weight) log_rank("Copied Connector", logger=logger, level=logging.INFO, rank=0) diff --git a/tools/idefics3/generate_hf_predictions.py b/tools/idefics3/generate_hf_predictions.py index 708567af..1f318827 100644 --- a/tools/idefics3/generate_hf_predictions.py +++ b/tools/idefics3/generate_hf_predictions.py @@ -14,6 +14,7 @@ from transformers import AutoProcessor, AutoModelForVision2Seq from transformers.image_utils import load_image +from transformers.modeling_flash_attention_utils import _flash_attention_forward DEVICE = torch.device("cuda") TORCH_DTYPE = torch.bfloat16 @@ -82,6 +83,234 @@ def forward_embedding( embeddings = embeddings + self.position_embedding(position_ids) return embeddings +def forward_attn( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value = None, + output_attentions: bool = False, + use_cache: bool = False, + layer_idx: int = 0, + **kwargs, + ): + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + combined = torch.cat([key_states, value_states, query_states], dim=-1) + + torch.save(combined, f"img_emb_hf_{layer_idx}_qkv.pt") + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(Idefics2VisionRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + # logger.warning_once( + # f"The input hidden states seems to be silently casted in float32, this might be related to" + # f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + # f" {target_dtype}." + # ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + torch.save(query_states, f"img_emb_hf_{layer_idx}_query_states.pt") + torch.save(key_states, f"img_emb_hf_{layer_idx}_key_states.pt") + torch.save(value_states, f"img_emb_hf_{layer_idx}_value_states.pt") + torch.save(attention_mask, f"img_emb_hf_{layer_idx}_attention_mask.pt") + torch.save((q_len, dropout_rate, self.is_causal, self._flash_attn_uses_top_left_mask), f"img_emb_hf_{layer_idx}_args.pt") + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous() + + + torch.save(attn_output, f"img_emb_hf_{layer_idx}_attn_output.pt") + + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +def forward_encoder_layer( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_idx: int, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + torch.save(hidden_states, f"img_emb_hf_{layer_idx}_before_ln1.pt") + + hidden_states = self.layer_norm1(hidden_states) + + torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_ln1.pt") + + hidden_states, attn_weights = forward_attn( + self.self_attn, + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + layer_idx=layer_idx, + ) + + torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_attn.pt") + + hidden_states = residual + hidden_states + + torch.save(hidden_states, f"img_emb_hf_{layer_idx}_before_ln2.pt") + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + + torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_ln2.pt") + + hidden_states = self.mlp(hidden_states) + + torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_mlp.pt") + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + +# Ignore copy +def forward_encoder( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = forward_encoder_layer( + encoder_layer, + hidden_states, + attention_mask, + layer_idx=i, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + torch.save(hidden_states, f"img_emb_hf_{i}.pt") + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return hidden_states, encoder_states, all_attentions + def forward_vision( self, @@ -110,6 +339,8 @@ def forward_vision( patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device) hidden_states = forward_embedding(self.embeddings, pixel_values=pixel_values, patch_attention_mask=patch_attention_mask) + + torch.save(hidden_states, "img_emb_hf.pt") patch_attention_mask = patch_attention_mask.view(batch_size, -1) # The call to `_upad_input` in `_flash_attention_forward` is expensive @@ -120,7 +351,8 @@ def forward_vision( elif not self._use_flash_attention_2: patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) - encoder_outputs = self.encoder( + encoder_outputs = forward_encoder( + self.encoder, inputs_embeds=hidden_states, attention_mask=patch_attention_mask, output_attentions=output_attentions, @@ -179,6 +411,9 @@ def forward( if inputs_embeds is None: inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(self.device) + + torch.save(inputs_embeds, "inputs_embeds_hf.pt") + # START VISUAL INPUTS INTEGRATION if pixel_values is not None and image_hidden_states is not None: raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time") @@ -211,6 +446,8 @@ def forward( patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + torch.save(pixel_values, "pv_hf.pt") + # Get sequence from the vision encoder image_hidden_states = forward_vision( self.vision_model, @@ -218,6 +455,9 @@ def forward( patch_attention_mask=patch_attention_mask, ) + + torch.save(image_hidden_states, "img_hid_hf.pt") + # Modality projection & resampling image_hidden_states = self.connector(image_hidden_states) @@ -248,7 +488,7 @@ def forward( if not return_dict: return tuple(v for v in [*outputs, image_hidden_states] if v is not None) - return None + return outputs @@ -269,17 +509,26 @@ def main(args): text = processor.apply_chat_template(messages, add_generation_prompt=True) inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE) - + print( + model.model.config + ) with 
torch.no_grad(): # output = model(**inputs) - forward( + output = forward( model.model, use_cache=False, **inputs ) + print(output) + print(output.last_hidden_state.shape) + + torch.save( + output, + "hf_output.pt", + ) if __name__ == "__main__": diff --git a/tools/idefics3/generate_nanotron_predictions.py b/tools/idefics3/generate_nanotron_predictions.py index 4bf01de4..6856bdb2 100644 --- a/tools/idefics3/generate_nanotron_predictions.py +++ b/tools/idefics3/generate_nanotron_predictions.py @@ -145,6 +145,11 @@ def main(args): with torch.no_grad(): output = model.model(**inputs) + torch.save( + output, + "nanotron_output.pt", + ) + if not RANK: predicted_tokens = [5, 27, 34] # Index of the predictions to compare across models term_cols = int(os.get_terminal_size().columns / 3) From 021189c5fc89886f68a2cff5599007f7d99be75b Mon Sep 17 00:00:00 2001 From: kisate Date: Mon, 11 Nov 2024 17:33:29 +0000 Subject: [PATCH 05/45] Fixed vision transformer outputs --- .gitignore | 3 +- compare_outputs.py | 13 +- src/nanotron/models/idefics.py | 82 ++-------- tools/idefics3/generate_hf_predictions.py | 179 +++++++++++++++------- 4 files changed, 150 insertions(+), 127 deletions(-) diff --git a/.gitignore b/.gitignore index 0037d9ae..47e6b7d1 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ cython_debug/ checkpoints/ wandb/ -nanotron-ckpt/* \ No newline at end of file +nanotron-ckpt/* +nanotron_checkpoints/* \ No newline at end of file diff --git a/compare_outputs.py b/compare_outputs.py index 69a64bdb..11e08eb1 100644 --- a/compare_outputs.py +++ b/compare_outputs.py @@ -1,16 +1,21 @@ import torch +layer = 31 hf = torch.load( - f"img_emb_hf_{26}_after_mlp.pt", -).detach().cpu() + f"hf_output_text_llama.pt", +).last_hidden_state.detach().cpu() nano = torch.load( - f"img_emb_nano_{26}_after_mlp.pt", -).detach().cpu() + f"nano_output_text_llama.pt", +).detach().cpu().transpose(0, 1) def compare_outputs(hf, nano): print( hf.shape, nano.shape ) + + if hf.shape != nano.shape: + print("Shapes are different") + return print( f"Max difference between outputs: {torch.max(torch.abs(hf - nano))}" diff --git a/src/nanotron/models/idefics.py b/src/nanotron/models/idefics.py index f8f4e582..de827bf5 100644 --- a/src/nanotron/models/idefics.py +++ b/src/nanotron/models/idefics.py @@ -227,8 +227,6 @@ def forward( batch_size, q_length, _ = qkv_states.shape - torch.save(qkv_states, f"img_emb_nano_{self.layer_idx}_qkv.pt") - if self.is_gqa: query_states, key_states, value_states = torch.split( qkv_states, @@ -272,41 +270,17 @@ def forward( key_states = key_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_qk) value_states = value_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_v) - torch.save(query_states, f"img_emb_nano_{self.layer_idx}_query_states.pt") - torch.save(key_states, f"img_emb_nano_{self.layer_idx}_key_states.pt") - torch.save(value_states, f"img_emb_nano_{self.layer_idx}_value_states.pt") - - # attention_output = self.attention( - # query_states=query_states, - # key_states=key_states, - # value_states=value_states, - # ) + attention_output = self.attention( + query_states=query_states, + key_states=key_states, + value_states=value_states, + ) - # attention_output = ( - # attention_output.contiguous().view(batch_size, q_length, self.n_local_q_heads * self.d_v).transpose(0, 1) - # ) - - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - None, - q_length, - dropout=0.0, - sliding_window=getattr(self, "sliding_window", None), - 
use_top_left_mask=False, - is_causal=False, - # **kwargs, + attention_output = ( + attention_output.contiguous().view(batch_size, q_length, self.n_local_q_heads * self.d_v) ) - - - attn_output = attn_output.contiguous().view(batch_size, q_length, self.d_model) - - - torch.save(attn_output, f"img_emb_nano_{self.layer_idx}_attn_output.pt") - - output = self.o_proj(attn_output) + output = self.o_proj(attention_output) return {"hidden_states": output, "sequence_mask": sequence_mask} @@ -399,34 +373,19 @@ def forward( hidden_states: Union[torch.Tensor, TensorPointer], sequence_mask: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + residual = hidden_states - torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_before_ln1.pt") - hidden_states = self.layer_norm1(hidden_states) - - torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_ln1.pt") - output = self.self_attn(hidden_states=hidden_states, sequence_mask=sequence_mask) - hidden_states = output["hidden_states"] - torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_attn.pt") - hidden_states = hidden_states + residual - - torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_before_ln2.pt") - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - - torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_ln2.pt") + hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"] - - torch.save(hidden_states, f"img_emb_nano_{self.layer_id}_after_mlp.pt") - hidden_states = hidden_states + residual return { @@ -526,9 +485,6 @@ def forward( patch_attention_mask=patch_attention_mask, )["embeddings"] - - torch.save(hidden_states, "img_emb_nano.pt") - patch_attention_mask = patch_attention_mask.view(batch_size, -1) hidden_encoder_states = { @@ -539,11 +495,9 @@ def forward( for i, encoder_layer in enumerate(self.encoder): hidden_encoder_states = encoder_layer(**hidden_encoder_states) - torch.save(hidden_encoder_states["hidden_states"], f"img_emb_nano_{i}.pt") - hidden_states = hidden_encoder_states["hidden_states"] hidden_states = self.post_layernorm(input=hidden_states) - + return hidden_states def get_block_compute_costs(self): @@ -713,9 +667,7 @@ def inputs_merger( inputs_embeds: Union[torch.Tensor, TensorPointer], image_hidden_states: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: - print( - inputs_embeds.shape, image_hidden_states.shape - ) + num_images, _, vision_hidden_size = image_hidden_states.shape special_image_token_mask = input_ids == self.image_token_id new_inputs_embeds = inputs_embeds.clone() @@ -756,16 +708,12 @@ def forward( patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) == patch_size * patch_size).bool() - torch.save(pixel_values, "pv_nano.pt") - # Get sequence from the vision encoder image_hidden_states = self.vision_model( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, )["hidden_states"] - torch.save(image_hidden_states, "img_hid_nano.pt") - # Modality projection & resampling image_hidden_states = self.connector( hidden_states=image_hidden_states @@ -773,8 +721,6 @@ def forward( inputs_embeds = self.llama.token_position_embeddings(input_ids=input_ids, input_mask=input_mask) - torch.save(inputs_embeds["input_embeds"], "input_embeds_nano.pt") - inputs_embeds = self.inputs_merger( input_ids=input_ids, 
inputs_embeds=inputs_embeds["input_embeds"].transpose(0, 1), @@ -783,8 +729,8 @@ def forward( outputs = self.llama.forward_with_hidden_states( input_ids=None, - input_embeds=inputs_embeds, - input_mask=input_mask, + input_embeds=inputs_embeds.transpose(0, 1), + input_mask=input_mask.transpose(0, 1), ) return outputs diff --git a/tools/idefics3/generate_hf_predictions.py b/tools/idefics3/generate_hf_predictions.py index 1f318827..35c5b021 100644 --- a/tools/idefics3/generate_hf_predictions.py +++ b/tools/idefics3/generate_hf_predictions.py @@ -102,10 +102,6 @@ def forward_attn( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - combined = torch.cat([key_states, value_states, query_states], dim=-1) - - torch.save(combined, f"img_emb_hf_{layer_idx}_qkv.pt") - # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape @@ -150,12 +146,6 @@ def forward_attn( key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - torch.save(query_states, f"img_emb_hf_{layer_idx}_query_states.pt") - torch.save(key_states, f"img_emb_hf_{layer_idx}_key_states.pt") - torch.save(value_states, f"img_emb_hf_{layer_idx}_value_states.pt") - torch.save(attention_mask, f"img_emb_hf_{layer_idx}_attention_mask.pt") - torch.save((q_len, dropout_rate, self.is_causal, self._flash_attn_uses_top_left_mask), f"img_emb_hf_{layer_idx}_args.pt") - attn_output = _flash_attention_forward( query_states, key_states, @@ -167,11 +157,7 @@ def forward_attn( use_top_left_mask=self._flash_attn_uses_top_left_mask, ) - attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous() - - - torch.save(attn_output, f"img_emb_hf_{layer_idx}_attn_output.pt") - + attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous() attn_output = self.out_proj(attn_output) if not output_attentions: @@ -198,13 +184,8 @@ def forward_encoder_layer( returned tensors for more detail. 
""" residual = hidden_states - - torch.save(hidden_states, f"img_emb_hf_{layer_idx}_before_ln1.pt") - hidden_states = self.layer_norm1(hidden_states) - - torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_ln1.pt") - + hidden_states, attn_weights = forward_attn( self.self_attn, hidden_states=hidden_states, @@ -213,21 +194,14 @@ def forward_encoder_layer( layer_idx=layer_idx, ) - torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_attn.pt") - hidden_states = residual + hidden_states - torch.save(hidden_states, f"img_emb_hf_{layer_idx}_before_ln2.pt") - + residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - - torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_ln2.pt") + hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) - torch.save(hidden_states, f"img_emb_hf_{layer_idx}_after_mlp.pt") - hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -299,8 +273,6 @@ def forward_encoder( hidden_states = layer_outputs[0] - torch.save(hidden_states, f"img_emb_hf_{i}.pt") - if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) @@ -340,8 +312,6 @@ def forward_vision( hidden_states = forward_embedding(self.embeddings, pixel_values=pixel_values, patch_attention_mask=patch_attention_mask) - torch.save(hidden_states, "img_emb_hf.pt") - patch_attention_mask = patch_attention_mask.view(batch_size, -1) # The call to `_upad_input` in `_flash_attention_forward` is expensive # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence), @@ -368,6 +338,125 @@ def forward_vision( return last_hidden_state +def forward_text_model( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + # logger.warning_once( + # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + # ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + # logger.warning_once( + # "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + # "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + # "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + # ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for i, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **flash_attn_kwargs, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return hidden_states, next_cache, all_hidden_states, all_self_attns + def forward( self, @@ -411,9 +500,6 @@ def forward( if inputs_embeds is None: inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(self.device) - - torch.save(inputs_embeds, "inputs_embeds_hf.pt") - # START VISUAL INPUTS INTEGRATION if pixel_values is not None and image_hidden_states is not None: raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time") @@ -446,8 +532,6 @@ def forward( patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size) patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - torch.save(pixel_values, "pv_hf.pt") - # Get sequence from the vision encoder image_hidden_states = forward_vision( self.vision_model, @@ -456,8 +540,6 @@ def forward( ) - torch.save(image_hidden_states, "img_hid_hf.pt") - # Modality projection & resampling image_hidden_states = self.connector(image_hidden_states) @@ -474,7 +556,8 @@ def forward( image_hidden_states=image_hidden_states, ) - outputs = self.text_model( + outputs = forward_text_model( + self.text_model, inputs_embeds=inputs_embeds, attention_mask=attention_mask, position_ids=position_ids, @@ -509,10 +592,6 @@ def main(args): text = 
processor.apply_chat_template(messages, add_generation_prompt=True) inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE) - print( - model.model.config - ) - with torch.no_grad(): # output = model(**inputs) @@ -522,14 +601,6 @@ def main(args): **inputs ) - print(output) - print(output.last_hidden_state.shape) - - torch.save( - output, - "hf_output.pt", - ) - if __name__ == "__main__": _args = get_args() From e23661fb1cf166f8a22b1d4c2d821c148c364684 Mon Sep 17 00:00:00 2001 From: kisate Date: Mon, 25 Nov 2024 02:46:22 +0000 Subject: [PATCH 06/45] tmp dataloader --- run_train.py | 53 +- src/nanotron/dataloader.py | 140 ++++- src/nanotron/models/idefics.py | 38 +- test_stuff.ipynb | 547 ++++++++++++++++++ tools/idefics3/convert_hf_to_nanotron.py | 32 +- tools/idefics3/generate_hf_predictions.py | 27 +- .../idefics3/generate_nanotron_predictions.py | 25 +- 7 files changed, 813 insertions(+), 49 deletions(-) create mode 100644 test_stuff.ipynb diff --git a/run_train.py b/run_train.py index 021d955d..60b33928 100644 --- a/run_train.py +++ b/run_train.py @@ -19,6 +19,7 @@ dummy_infinite_data_generator, get_datasets, get_train_dataloader, + vqa_process, ) from nanotron.helpers import ( compute_remain_train_steps_of_a_data_stage_from_ckp, @@ -32,7 +33,7 @@ try: from huggingface_hub import __version__ as hf_hub_version - from transformers import AutoTokenizer + from transformers import AutoTokenizer, AutoProcessor from transformers import __version__ as tf_version except ImportError: hf_hub_version = None @@ -46,6 +47,7 @@ def get_dataloader_from_data_stage( data: DataArgs, consumed_train_samples: int, num_remaining_train_steps: int, + vqa=False ): """ Returns a dataloader for a given data stage. @@ -96,24 +98,36 @@ def get_dataloader_from_data_stage( splits=data.dataset.hf_dataset_splits, )["train"] - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "left" - - # Check that tokenizer's vocab size is smaller than the model's vocab size - assert ( - tokenizer.vocab_size <= trainer.model_config.vocab_size - ), f"Tokenizer's vocab size ({tokenizer.vocab_size}) is larger than the model's vocab size ({trainer.model_config.vocab_size})" - - # We apply the Causal Language Modeling preprocessing - train_dataset = clm_process( - raw_dataset=raw_dataset, - tokenizer=tokenizer, - text_column_name=data.dataset.text_column_name, - dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process, - dataset_overwrite_cache=data.dataset.dataset_overwrite_cache, - sequence_length=trainer.sequence_length, - ) + + if vqa: + processor = AutoProcessor.from_pretrained(tokenizer_path) + train_dataset = vqa_process( + raw_dataset=raw_dataset, + processor=processor, + dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process, + dataset_overwrite_cache=data.dataset.dataset_overwrite_cache, + sequence_length=trainer.sequence_length, + ) + + else: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + # Check that tokenizer's vocab size is smaller than the model's vocab size + assert ( + tokenizer.vocab_size <= trainer.model_config.vocab_size + ), f"Tokenizer's vocab size ({tokenizer.vocab_size}) is larger than the model's vocab size ({trainer.model_config.vocab_size})" + + # We apply the Causal Language Modeling preprocessing + train_dataset = clm_process( + 
raw_dataset=raw_dataset, + tokenizer=tokenizer, + text_column_name=data.dataset.text_column_name, + dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process, + dataset_overwrite_cache=data.dataset.dataset_overwrite_cache, + sequence_length=trainer.sequence_length, + ) # We load the processed dataset on the ranks requiring it dataloader = get_train_dataloader( @@ -127,6 +141,7 @@ def get_dataloader_from_data_stage( dataloader_num_workers=data.num_loading_workers, seed_worker=data.seed, dataloader_drop_last=True, + dataset_columns=["input_ids", "pixel_values"] ) # Check if we have enough samples for train_steps diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py index 61f73557..52af9929 100644 --- a/src/nanotron/dataloader.py +++ b/src/nanotron/dataloader.py @@ -29,7 +29,7 @@ concatenate_datasets, load_dataset, ) - from transformers import PreTrainedTokenizerBase + from transformers import PreTrainedTokenizerBase, Idefics3Processor from transformers.trainer_pt_utils import DistributedSamplerWithLoop except ImportError: warnings.warn("Datasets and/or Transformers not installed, you'll be unable to use the dataloader.") @@ -119,6 +119,7 @@ def get_datasets( hf_dataset_or_datasets, hf_dataset_config_name, split=split, + trust_remote_code=True ) else: raise ValueError(f"hf_dataset_or_datasets must be a dict or string but is {type(hf_dataset_or_datasets)}") @@ -323,6 +324,63 @@ def _tokenize_and_group_texts(texts: List[str]) -> Dict[str, List[np.ndarray]]: ) return train_dataset +def vqa_process( + raw_dataset: "Dataset", + processor: "Idefics3Processor", + dataset_processing_num_proc_per_process: int, + dataset_overwrite_cache: bool, + sequence_length: int, +): + def format_example(example): + messages = [] + for i, x in enumerate(example["en"]): + user_message = { + "role": "user", + "content": [ + {"type": "text", "text": x["question"]}, + ] + } + + if i == 0: + user_message["content"].append( + {"type": "image"}, + ) + + messages.append(user_message) + assistant_message = { + "role": "assistant", + "content": [ + {"type": "text", "text": x["answer"]}, + ] + } + + messages.append(assistant_message) + return messages + + def _process_examples(examples: Dict) -> Dict[str, List[np.ndarray]]: + inputs = [ + processor( + text=processor.apply_chat_template(format_example(ex), add_generation_prompt=True), + images = [ex["image"]], + return_tensors="np", max_length=sequence_length + 1, padding="longest", truncation=True + ) + for ex in examples + ] + + return inputs + + train_dataset = raw_dataset.map( + _process_examples, + input_columns="qa", + remove_columns=raw_dataset.column_names, + batched=True, + num_proc=dataset_processing_num_proc_per_process, + load_from_cache_file=not dataset_overwrite_cache, + ) + + return train_dataset + + # Adapted from: https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/data/data_collator.py#L607 @dataclasses.dataclass @@ -398,6 +456,57 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()} return result +class DataCollatorForVQA: + sequence_length: int + input_pp_rank: int + output_pp_rank: int + parallel_context: ParallelContext + + def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + # Process the case when current rank doesn't require data. 
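# A toy, self-contained illustration, not part of the patch: the message structure
# that format_example() in vqa_process above builds before it is handed to
# processor.apply_chat_template. The sample record is made up; as in the patch,
# only the first user turn carries the {"type": "image"} placeholder.
example = {"en": [
    {"question": "What is shown in the document?", "answer": "An invoice."},
    {"question": "What is the total amount?", "answer": "42 EUR."},
]}

messages = []
for i, x in enumerate(example["en"]):
    user = {"role": "user", "content": [{"type": "text", "text": x["question"]}]}
    if i == 0:
        user["content"].append({"type": "image"})  # image slot only on the first turn
    messages.append(user)
    messages.append({"role": "assistant", "content": [{"type": "text", "text": x["answer"]}]})
# messages now alternates user/assistant turns; the processor later pairs the single
# image placeholder with the example's image when tokenizing.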
We return `TensorPointer` that points to ranks having the data. + current_pp_rank = dist.get_rank(self.parallel_context.pp_pg) + if current_pp_rank not in [ + self.input_pp_rank, + self.output_pp_rank, + ]: + assert all(len(example) == 0 for example in examples) + return { + "input_ids": TensorPointer(group_rank=self.input_pp_rank), + "input_mask": TensorPointer(group_rank=self.input_pp_rank), + "label_ids": TensorPointer(group_rank=self.output_pp_rank), + "label_mask": TensorPointer(group_rank=self.output_pp_rank), + "pixel_values": TensorPointer(group_rank=self.input_pp_rank), + } + + # Make sure we load only what's necessary, ie we only load a `input_ids` column. + assert all(list(example.keys()) == ["input_ids", "pixel_values"] for example in examples) + + # TODO @nouamanetazi: Is it better to have examples as np.array or torch.Tensor? + input_ids = np.vstack([examples[i]["input_ids"] for i in range(len(examples))]) + pixel_values = np.vstack([examples[i]["pixel_values"] for i in range(len(examples))]) + + batch_size, expanded_input_length = input_ids.shape + + result: Dict[str, Union[np.ndarray, TensorPointer]] = {} + + result["input_ids"] = TensorPointer(group_rank=self.input_pp_rank) + result["input_mask"] = TensorPointer(group_rank=self.input_pp_rank) + result["label_ids"] = TensorPointer(group_rank=self.output_pp_rank) + result["label_mask"] = TensorPointer(group_rank=self.output_pp_rank) + result["pixel_values"] = TensorPointer(group_rank=self.input_pp_rank) + + if current_pp_rank == self.input_pp_rank: + result["input_ids"] = input_ids[:, :-1] + result["input_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_) + result["pixel_values"] = pixel_values + + if current_pp_rank == self.output_pp_rank: + result["label_ids"] = input_ids[:, 1:] + result["label_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_) + result["pixel_values"] = pixel_values + + result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()} + return result # Adapted from https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L763-L835 def get_sampler( @@ -452,6 +561,7 @@ def get_train_dataloader( dataloader_drop_last: bool = True, dataloader_pin_memory: bool = True, use_loop_to_round_batch_size: bool = False, + dataset_columns = ["input_ids"] ) -> DataLoader: if not isinstance(train_dataset, datasets.Dataset): raise ValueError(f"training requires a datasets.Dataset, but got {type(train_dataset)}") @@ -461,17 +571,17 @@ def get_train_dataloader( input_pp_rank, output_pp_rank, ]: - train_dataset = train_dataset.with_format(type="numpy", columns=["input_ids"], output_all_columns=True) + train_dataset = train_dataset.with_format(type="numpy", columns=dataset_columns, output_all_columns=True) # Case of ranks not requiring data. We give them an infinite dummy dataloader else: # - assert train_dataset.column_names == ["input_ids"], ( - f"Dataset has to have a single column, with `input_ids` as the column name. " + assert train_dataset.column_names == dataset_columns, ( + f"Dataset should only have {dataset_columns} columns" f"Current dataset: {train_dataset}" ) dataset_length = len(train_dataset) - train_dataset = train_dataset.remove_columns(column_names="input_ids") + train_dataset = train_dataset.remove_columns(column_names=dataset_columns) assert ( len(train_dataset) == 0 ), f"Dataset has to be empty after removing the `input_ids` column. 
Current dataset: {train_dataset}" @@ -480,12 +590,20 @@ def get_train_dataloader( # No need to spawn a lot of workers, we can just use main dataloader_num_workers = 0 - data_collator = DataCollatorForCLM( - sequence_length=sequence_length, - input_pp_rank=input_pp_rank, - output_pp_rank=output_pp_rank, - parallel_context=parallel_context, - ) + if "pixel_values" in dataset_columns: + data_collator = DataCollatorForVQA( + sequence_length=sequence_length, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + parallel_context=parallel_context, + ) + else: + data_collator = DataCollatorForCLM( + sequence_length=sequence_length, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + parallel_context=parallel_context, + ) # Compute size and rank of dataloader workers dp_ranks_size = parallel_context.dp_pg.size() diff --git a/src/nanotron/models/idefics.py b/src/nanotron/models/idefics.py index de827bf5..6778be40 100644 --- a/src/nanotron/models/idefics.py +++ b/src/nanotron/models/idefics.py @@ -639,6 +639,10 @@ def __init__( self.config = config self.image_token_id = config.image_token_id + self.tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE + tp_linear_async_communication = ( + parallel_config.tp_linear_async_communication if parallel_config is not None else False + ) self.llama = LlamaModel( config=config.llama_config, @@ -660,6 +664,32 @@ def __init__( self.image_seq_len = int( ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2) ) + + self.lm_head = PipelineBlock( + p2p=self.llama.p2p, + # Understand that this means that we return sharded logits that are going to need to be gathered + module_builder=TensorParallelColumnLinear, + module_kwargs={ + "in_features": config.llama_config.hidden_size, + "out_features": config.llama_config.vocab_size, + "pg": parallel_context.tp_pg, + "bias": False, + # TODO @thomasw21: refactor so that we store that default in a single place. 
diff --git a/src/nanotron/models/idefics.py b/src/nanotron/models/idefics.py
index de827bf5..6778be40 100644
--- a/src/nanotron/models/idefics.py
+++ b/src/nanotron/models/idefics.py
@@ -639,6 +639,10 @@ def __init__(
         self.config = config
         self.image_token_id = config.image_token_id
 
+        self.tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE
+        tp_linear_async_communication = (
+            parallel_config.tp_linear_async_communication if parallel_config is not None else False
+        )
 
         self.llama = LlamaModel(
             config=config.llama_config,
@@ -660,6 +664,32 @@ def __init__(
         self.image_seq_len = int(
             ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
         )
+
+        self.lm_head = PipelineBlock(
+            p2p=self.llama.p2p,
+            # Understand that this means that we return sharded logits that are going to need to be gathered
+            module_builder=TensorParallelColumnLinear,
+            module_kwargs={
+                "in_features": config.llama_config.hidden_size,
+                "out_features": config.llama_config.vocab_size,
+                "pg": parallel_context.tp_pg,
+                "bias": False,
+                # TODO @thomasw21: refactor so that we store that default in a single place.
+                "mode": self.tp_mode,
+                "async_communication": tp_linear_async_communication,
+                "tp_recompute_allgather": parallel_config.tp_recompute_allgather,
+            },
+            module_input_keys={"x"},
+            module_output_keys={"logits"},
+        )
+
+        self.cast_to_fp32 = PipelineBlock(
+            p2p=self.llama.p2p,
+            module_builder=lambda: lambda x: x.float(),
+            module_kwargs={},
+            module_input_keys={"x"},
+            module_output_keys={"output"},
+        )
 
     def inputs_merger(
         self,
@@ -733,7 +763,13 @@ def forward(
             input_mask=input_mask.transpose(0, 1),
         )
 
-        return outputs
+        hidden_states = outputs[1]
+
+        sharded_logits = self.lm_head(x=hidden_states)["logits"]
+
+        fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"]
+
+        return fp32_sharded_logits, hidden_states
 
     def get_block_compute_costs(self):
         llama_cost = self.llama.get_block_compute_costs()
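
With the head and the fp32 cast in place, `forward` now returns `(fp32_sharded_logits, hidden_states)` instead of the raw decoder output, and, as the comment in the diff notes, the logits stay tensor-parallel-sharded and still need to be gathered downstream. A single-device stand-in for the two new blocks (a plain `nn.Linear` in place of `TensorParallelColumnLinear`; sizes are illustrative, not Idefics3's real ones):

```python
import torch
import torch.nn as nn

hidden_size, vocab_size, seq_len, batch = 32, 128, 6, 2   # illustrative sizes only

# Stand-in for the lm_head PipelineBlock: with a TP group of size 1,
# TensorParallelColumnLinear degenerates to an ordinary Linear.
lm_head = nn.Linear(hidden_size, vocab_size, bias=False).to(torch.bfloat16)

hidden_states = torch.randn(seq_len, batch, hidden_size, dtype=torch.bfloat16)

sharded_logits = lm_head(hidden_states)       # bf16, and vocab-sharded in the real TP setting
fp32_logits = sharded_logits.float()          # what the cast_to_fp32 block does before the loss

print(fp32_logits.shape, fp32_logits.dtype)   # torch.Size([6, 2, 128]) torch.float32
```
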
diff --git a/test_stuff.ipynb b/test_stuff.ipynb
new file mode 100644
index 00000000..5fdd1ea9
--- /dev/null
+++ b/test_stuff.ipynb
@@ -0,0 +1,547 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nanotron.dataloader import (\n",
+    "    clm_process,\n",
+    "    dummy_infinite_data_generator,\n",
+    "    get_datasets,\n",
+    "    get_train_dataloader,\n",
+    ")\n",
+    "\n",
+    "from transformers import AutoTokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = \"cmarkea/doc-vqa\"\n",
+    "tokenizer_path = \"robot-test/dummy-tokenizer-wordlevel\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# os.environ[\"HF_ENDPOINT\"] = \"http://localhost:5564\"\n",
+    "os.environ[\"HF_HUB_ENABLE_HF_TRANSFER\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
[... the remainder of the 547-line notebook is execution output: Hugging Face download-progress widgets
 (README.md, 3.87 kB, ...) and a ValueError traceback raised at
 `text = processor.apply_chat_template(pretraining_data, add_generation_prompt=True)`:
 "No chat template is set for this processor. Please either set the `chat_template` attribute, or provide
 a chat template as an argument. See https://huggingface.co/docs/transformers/main/en/chat_templating for
 more information." The trace then descends through `PreTrainedTokenizerBase.apply_chat_template` into
 jinja2 template rendering. ...]