Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +29 -0
config.json +23 -212
special_tokens_map.json +7 -21
tokenizer_config.json +11 -2

README.md ADDED Viewed

	@@ -0,0 +1,29 @@

+---
+library_name: transformers
+license: apache-2.0
+datasets:
+- HuggingFaceM4/the_cauldron
+- HuggingFaceM4/Docmatix
+pipeline_tag: video-text-to-text
+language:
+- en
+base_model:
+- HuggingFaceTB/SmolLM2-360M-Instruct
+- google/siglip-base-patch16-512
+- HuggingFaceTB/SmolVLM-500M-Instruct
+tags:
+- mlx
+---
+# HuggingFaceTB/SmolVLM2-500M-Video-Instruct-mlx
+This model was converted to MLX format from [`HuggingFaceTB/SmolVLM2-500M-Video-Instruct`](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) using mlx-vlm version **0.1.13**.
+Refer to the [original model card](https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct) for more details on the model.
+## Use with mlx
+```bash
+pip install -U mlx-vlm
+```
+```bash
+python -m mlx_vlm.generate --model HuggingFaceTB/SmolVLM2-500M-Video-Instruct-mlx --max-tokens 100 --temp 0.0 --prompt "Describe this image." --image <path_to_image>
+```

config.json CHANGED Viewed

@@ -1,145 +1,27 @@
 {
-    "_attn_implementation_autoset": false,
-    "add_cross_attention": false,
     "architectures": [
         "SmolVLMForConditionalGeneration"
     ],
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "id2label": {
-        "0": "LABEL_0",
-        "1": "LABEL_1"
-    },
     "image_token_id": 49190,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-        "LABEL_0": 0,
-        "LABEL_1": 1
-    },
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "idefics3",
-    "no_repeat_ngram_size": 0,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
     "pad_token_id": 128002,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
     "scale_factor": 4,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
     "text_config": {
-        "vocab_size": 49280,
-        "max_position_embeddings": 8192,
-        "hidden_size": 960,
-        "intermediate_size": 2560,
-        "num_hidden_layers": 32,
-        "num_attention_heads": 15,
-        "num_key_value_heads": 5,
-        "hidden_act": "silu",
-        "initializer_range": 0.02,
-        "rms_norm_eps": 1e-05,
-        "pretraining_tp": 1,
-        "use_cache": true,
-        "rope_theta": 100000,
-        "rope_scaling": null,
-        "attention_bias": false,
-        "attention_dropout": 0.0,
-        "mlp_bias": false,
-        "head_dim": 64,
-        "return_dict": true,
-        "output_hidden_states": false,
-        "output_attentions": false,
-        "torchscript": false,
-        "torch_dtype": "bfloat16",
-        "use_bfloat16": false,
-        "tf_legacy_loss": false,
-        "pruned_heads": {},
-        "tie_word_embeddings": false,
-        "chunk_size_feed_forward": 0,
-        "is_encoder_decoder": false,
-        "is_decoder": false,
-        "cross_attention_hidden_size": null,
-        "add_cross_attention": false,
-        "tie_encoder_decoder": false,
-        "max_length": 20,
-        "min_length": 0,
-        "do_sample": false,
-        "early_stopping": false,
-        "num_beams": 1,
-        "num_beam_groups": 1,
-        "diversity_penalty": 0.0,
-        "temperature": 1.0,
-        "top_k": 50,
-        "top_p": 1.0,
-        "typical_p": 1.0,
-        "repetition_penalty": 1.0,
-        "length_penalty": 1.0,
-        "no_repeat_ngram_size": 0,
-        "encoder_no_repeat_ngram_size": 0,
-        "bad_words_ids": null,
-        "num_return_sequences": 1,
-        "output_scores": false,
-        "return_dict_in_generate": false,
-        "forced_bos_token_id": null,
-        "forced_eos_token_id": null,
-        "remove_invalid_values": false,
-        "exponential_decay_length_penalty": null,
-        "suppress_tokens": null,
-        "begin_suppress_tokens": null,
         "architectures": [
             "VLlama3ForCausalLM"
         ],
-        "finetuning_task": null,
-        "id2label": {
-            "0": "LABEL_0",
-            "1": "LABEL_1"
-        },
-        "label2id": {
-            "LABEL_0": 0,
-            "LABEL_1": 1
-        },
-        "tokenizer_class": null,
-        "prefix": null,
-        "bos_token_id": 1,
-        "pad_token_id": 2,
-        "eos_token_id": 2,
-        "sep_token_id": null,
-        "decoder_start_token_id": null,
-        "task_specific_params": null,
-        "problem_type": null,
-        "_name_or_path": "None",
-        "_attn_implementation_autoset": false,
-        "_flash_attn_2_enabled": true,
         "is_llama_config": true,
         "model_type": "llama",
         "neftune_noise_alpha": 0.0,
         "perceiver_config": {
             "_attn_implementation_autoset": false,
             "_name_or_path": "",
@@ -215,115 +97,44 @@
         },
         "pixel_shuffle_factor": 4,
         "qk_layer_norms": false,
         "rope_interleaved": false,
         "transformers.js_config": {
             "kv_cache_dtype": {
                 "fp16": "float16",
                 "q4f16": "float16"
             }
         },
-        "use_resampler": false
     },
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
     "tie_word_embeddings": false,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "float32",
-    "torchscript": false,
     "transformers.js_config": {
         "kv_cache_dtype": {
             "fp16": "float16",
             "q4f16": "float16"
         }
     },
-    "transformers_version": "4.49.0.dev0",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
     "use_cache": false,
     "use_reentrant_checkpointing": false,
     "vision_config": {
-        "return_dict": true,
-        "output_hidden_states": false,
-        "output_attentions": false,
-        "torchscript": false,
-        "torch_dtype": null,
-        "use_bfloat16": false,
-        "tf_legacy_loss": false,
-        "pruned_heads": {},
-        "tie_word_embeddings": false,
-        "chunk_size_feed_forward": 0,
-        "is_encoder_decoder": false,
-        "is_decoder": false,
-        "cross_attention_hidden_size": null,
-        "add_cross_attention": false,
-        "tie_encoder_decoder": false,
-        "max_length": 20,
-        "min_length": 0,
-        "do_sample": false,
-        "early_stopping": false,
-        "num_beams": 1,
-        "num_beam_groups": 1,
-        "diversity_penalty": 0.0,
-        "temperature": 1.0,
-        "top_k": 50,
-        "top_p": 1.0,
-        "typical_p": 1.0,
-        "repetition_penalty": 1.0,
-        "length_penalty": 1.0,
-        "no_repeat_ngram_size": 0,
-        "encoder_no_repeat_ngram_size": 0,
-        "bad_words_ids": null,
-        "num_return_sequences": 1,
-        "output_scores": false,
-        "return_dict_in_generate": false,
-        "forced_bos_token_id": null,
-        "forced_eos_token_id": null,
-        "remove_invalid_values": false,
-        "exponential_decay_length_penalty": null,
-        "suppress_tokens": null,
-        "begin_suppress_tokens": null,
-        "architectures": null,
-        "finetuning_task": null,
-        "id2label": {
-            "0": "LABEL_0",
-            "1": "LABEL_1"
-        },
-        "label2id": {
-            "LABEL_0": 0,
-            "LABEL_1": 1
-        },
-        "tokenizer_class": null,
-        "prefix": null,
-        "bos_token_id": null,
-        "pad_token_id": null,
-        "eos_token_id": null,
-        "sep_token_id": null,
-        "decoder_start_token_id": null,
-        "task_specific_params": null,
-        "problem_type": null,
-        "_name_or_path": "",
-        "_attn_implementation_autoset": false,
         "max_image_size": {
             "longest_edge": 512
         },
-        "model_type": "idefics3_vision",
         "size": {
             "longest_edge": 2048
         },
-        "use_base_siglip": false,
-        "hidden_size": 768,
-        "intermediate_size": 3072,
-        "num_hidden_layers": 12,
-        "num_attention_heads": 12,
-        "num_channels": 3,
-        "patch_size": 16,
-        "image_size": 512,
-        "attention_dropout": 0.0,
-        "layer_norm_eps": 1e-06,
-        "hidden_act": "gelu_pytorch_tanh",
-        "initializer_range": 0.02
     },
     "vocab_size": 49280
 }

 {
     "architectures": [
         "SmolVLMForConditionalGeneration"
     ],
     "image_token_id": 49190,
+    "model_type": "smolvlm",
     "pad_token_id": 128002,
     "scale_factor": 4,
     "text_config": {
+        "_flash_attn_2_enabled": true,
+        "_name_or_path": "None",
         "architectures": [
             "VLlama3ForCausalLM"
         ],
+        "head_dim": 64,
+        "hidden_size": 960,
+        "intermediate_size": 2560,
         "is_llama_config": true,
+        "max_position_embeddings": 8192,
         "model_type": "llama",
         "neftune_noise_alpha": 0.0,
+        "num_attention_heads": 15,
+        "num_key_value_heads": 5,
+        "pad_token_id": 2,
         "perceiver_config": {
             "_attn_implementation_autoset": false,
             "_name_or_path": "",
         },
         "pixel_shuffle_factor": 4,
         "qk_layer_norms": false,
+        "rms_norm_eps": 1e-05,
         "rope_interleaved": false,
+        "rope_theta": 100000,
+        "torch_dtype": "bfloat16",
         "transformers.js_config": {
             "kv_cache_dtype": {
                 "fp16": "float16",
                 "q4f16": "float16"
             }
         },
+        "use_resampler": false,
+        "vocab_size": 49280
     },
     "tie_word_embeddings": false,
     "torch_dtype": "float32",
     "transformers.js_config": {
         "kv_cache_dtype": {
             "fp16": "float16",
             "q4f16": "float16"
         }
     },
+    "transformers_version": "4.47.1",
     "use_cache": false,
     "use_reentrant_checkpointing": false,
     "vision_config": {
+        "hidden_size": 768,
+        "image_size": 512,
         "max_image_size": {
             "longest_edge": 512
         },
+        "model_type": "smolvlm_vision",
+        "num_attention_heads": 12,
+        "patch_size": 16,
         "size": {
             "longest_edge": 2048
         },
+        "tie_word_embeddings": false,
+        "use_base_siglip": false
     },
     "vocab_size": 49280
 }

special_tokens_map.json CHANGED Viewed

@@ -1,26 +1,8 @@
 {
   "additional_special_tokens": [
-    {
-      "content": "<fake_token_around_image>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<image>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<end_of_utterance>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    }
   ],
   "bos_token": {
     "content": "<|im_start|>",
@@ -29,6 +11,7 @@
     "rstrip": false,
     "single_word": false
   },
   "eos_token": {
     "content": "<end_of_utterance>",
     "lstrip": false,
@@ -36,6 +19,9 @@
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
     "content": "<|im_end|>",
     "lstrip": false,

 {
   "additional_special_tokens": [
+    "<fake_token_around_image>",
+    "<image>",
+    "<end_of_utterance>"
   ],
   "bos_token": {
     "content": "<|im_start|>",
     "rstrip": false,
     "single_word": false
   },
+  "end_of_utterance_token": "<end_of_utterance>",
   "eos_token": {
     "content": "<end_of_utterance>",
     "lstrip": false,
     "rstrip": false,
     "single_word": false
   },
+  "fake_image_token": "<fake_token_around_image>",
+  "global_image_token": "<global-img>",
+  "image_token": "<image>",
   "pad_token": {
     "content": "<|im_end|>",
     "lstrip": false,

tokenizer_config.json CHANGED Viewed

@@ -1170,12 +1170,21 @@
   "bos_token": "<|im_start|>",
   "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<end_of_utterance>",
-  "extra_special_tokens": {},
   "legacy": false,
   "model_max_length": 8192,
   "pad_token": "<|im_end|>",
-  "processor_class": "Idefics3Processor",
   "tokenizer_class": "GPT2Tokenizer",
   "truncation_side": "left",
   "unk_token": "<|endoftext|>",

   "bos_token": "<|im_start|>",
   "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
+  "end_of_utterance_token": "<end_of_utterance>",
   "eos_token": "<end_of_utterance>",
+  "extra_special_tokens": {
+    "end_of_utterance_token": "<end_of_utterance>",
+    "fake_image_token": "<fake_token_around_image>",
+    "global_image_token": "<global-img>",
+    "image_token": "<image>"
+  },
+  "fake_image_token": "<fake_token_around_image>",
+  "global_image_token": "<global-img>",
+  "image_token": "<image>",
   "legacy": false,
   "model_max_length": 8192,
   "pad_token": "<|im_end|>",
+  "processor_class": "SmolVLMProcessor",
   "tokenizer_class": "GPT2Tokenizer",
   "truncation_side": "left",
   "unk_token": "<|endoftext|>",