Apply GitHub PR #127 to all HF models
modeling_internvl_chat.py CHANGED
@@ -41,7 +41,7 @@ class InternVLChatModel(PreTrainedModel):
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
         super().__init__(config)

-        assert version_cmp(transformers.__version__, '4.
+        assert version_cmp(transformers.__version__, '4.37.0', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
@@ -108,7 +108,7 @@ class InternVLChatModel(PreTrainedModel):
         B, N, C = input_embeds.shape
         input_embeds = input_embeds.reshape(B * N, C)

-        if torch.distributed.get_rank() == 0:
+        if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
             print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')

         input_ids = input_ids.reshape(B * N)
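For context on the first hunk: version_cmp is a helper that ships with the InternVL model code, and the previous minimum version string is cut off in the diff as captured above. Below is a hypothetical equivalent, assuming version_cmp parses both version strings and applies the comparison operator named by its third argument ('ge', i.e. >=); only the 4.37.0 floor comes from the diff itself.

import operator

import transformers
from packaging import version


def version_cmp(v1: str, v2: str, op: str = 'eq') -> bool:
    # getattr(operator, 'ge') is operator.ge, so version_cmp(a, b, 'ge')
    # checks version.parse(a) >= version.parse(b).
    return getattr(operator, op)(version.parse(v1), version.parse(v2))


# The assert from the new line 44: pin transformers to 4.37.0 or newer.
assert version_cmp(transformers.__version__, '4.37.0', 'ge')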
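The second hunk is the substance of PR #127: torch.distributed.get_rank() raises a RuntimeError when the default process group was never initialized, which is exactly the state of plain single-process inference, so the debug print crashed non-distributed runs. A minimal sketch of the guarded pattern, using a hypothetical print_on_rank0 helper:

import torch.distributed as dist


def print_on_rank0(msg: str) -> None:
    # Distributed run: only rank 0 prints. Non-distributed run:
    # is_initialized() is False and the short-circuit skips get_rank(),
    # so nothing is printed and nothing raises.
    if dist.is_initialized() and dist.get_rank() == 0:
        print(msg)


print_on_rank0('dynamic ViT batch size: ...')  # safe without init_process_group

On PyTorch builds compiled without distributed support, checking torch.distributed.is_available() first gives the same protection one level earlier.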