fix compatibility issue for transformers 4.46+
Browse files- README.md +9 -8
- configuration_internvl_chat.py +2 -2
- modeling_intern_vit.py +1 -0
README.md
CHANGED
@@ -5,6 +5,7 @@ library_name: transformers
|
|
5 |
base_model:
|
6 |
- OpenGVLab/InternViT-300M-448px
|
7 |
- microsoft/Phi-3-mini-128k-instruct
|
|
|
8 |
base_model_relation: merge
|
9 |
language:
|
10 |
- multilingual
|
@@ -19,7 +20,7 @@ tags:
|
|
19 |
|
20 |
# Mini-InternVL-Chat-4B-V1-5
|
21 |
|
22 |
-
[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[📜 InternVL 1.0
|
23 |
|
24 |
[\[🗨️ Chat Demo\]](https://internvl.opengvlab.com/) [\[🤗 HF Demo\]](https://huggingface.co/spaces/OpenGVLab/InternVL) [\[🚀 Quick Start\]](#quick-start) [\[📖 中文解读\]](https://zhuanlan.zhihu.com/p/706547971) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
|
25 |
|
@@ -69,7 +70,7 @@ We provide an example code to run Mini-InternVL-Chat-4B-V1-5 using `transformers
|
|
69 |
|
70 |
We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
|
71 |
|
72 |
-
> Please use transformers
|
73 |
|
74 |
### Model Loading
|
75 |
|
@@ -379,7 +380,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
|
|
379 |
print(f'User: {question}\nAssistant: {response}')
|
380 |
```
|
381 |
|
382 |
-
#### Streaming
|
383 |
|
384 |
Besides this method, you can also use the following code to get streamed output.
|
385 |
|
@@ -419,12 +420,12 @@ Many repositories now support fine-tuning of the InternVL series models, includi
|
|
419 |
LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the MMRazor and MMDeploy teams.
|
420 |
|
421 |
```sh
|
422 |
-
pip install lmdeploy
|
423 |
```
|
424 |
|
425 |
LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.
|
426 |
|
427 |
-
#### A 'Hello, world'
|
428 |
|
429 |
```python
|
430 |
from lmdeploy import pipeline, TurbomindEngineConfig
|
@@ -439,7 +440,7 @@ print(response.text)
|
|
439 |
|
440 |
If `ImportError` occurs while executing this case, please install the required dependency packages as prompted.
|
441 |
|
442 |
-
#### Multi-images
|
443 |
|
444 |
When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
|
445 |
|
@@ -464,7 +465,7 @@ response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe thes
|
|
464 |
print(response.text)
|
465 |
```
|
466 |
|
467 |
-
#### Batch
|
468 |
|
469 |
Conducting inference with batch prompts is quite straightforward; just place them within a list structure:
|
470 |
|
@@ -484,7 +485,7 @@ response = pipe(prompts)
|
|
484 |
print(response)
|
485 |
```
|
486 |
|
487 |
-
#### Multi-turn
|
488 |
|
489 |
There are two ways to do the multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use above introduced method, the other is to use the `pipeline.chat` interface.
|
490 |
|
|
|
5 |
base_model:
|
6 |
- OpenGVLab/InternViT-300M-448px
|
7 |
- microsoft/Phi-3-mini-128k-instruct
|
8 |
+
new_version: OpenGVLab/InternVL2_5-4B
|
9 |
base_model_relation: merge
|
10 |
language:
|
11 |
- multilingual
|
|
|
20 |
|
21 |
# Mini-InternVL-Chat-4B-V1-5
|
22 |
|
23 |
+
[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[📜 InternVL 1.0\]](https://arxiv.org/abs/2312.14238) [\[📜 InternVL 1.5\]](https://arxiv.org/abs/2404.16821) [\[📜 Mini-InternVL\]](https://arxiv.org/abs/2410.16261)
|
24 |
|
25 |
[\[🗨️ Chat Demo\]](https://internvl.opengvlab.com/) [\[🤗 HF Demo\]](https://huggingface.co/spaces/OpenGVLab/InternVL) [\[🚀 Quick Start\]](#quick-start) [\[📖 中文解读\]](https://zhuanlan.zhihu.com/p/706547971) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
|
26 |
|
|
|
70 |
|
71 |
We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
|
72 |
|
73 |
+
> Please use transformers>=4.37.2 to ensure the model works normally.
|
74 |
|
75 |
### Model Loading
|
76 |
|
|
|
380 |
print(f'User: {question}\nAssistant: {response}')
|
381 |
```
|
382 |
|
383 |
+
#### Streaming Output
|
384 |
|
385 |
Besides this method, you can also use the following code to get streamed output.
|
386 |
|
|
|
420 |
LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the MMRazor and MMDeploy teams.
|
421 |
|
422 |
```sh
|
423 |
+
pip install "lmdeploy>=0.5.3"
|
424 |
```
|
425 |
|
426 |
LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.
|
427 |
|
428 |
+
#### A 'Hello, world' Example
|
429 |
|
430 |
```python
|
431 |
from lmdeploy import pipeline, TurbomindEngineConfig
|
|
|
440 |
|
441 |
If `ImportError` occurs while executing this case, please install the required dependency packages as prompted.
|
442 |
|
443 |
+
#### Multi-images Inference
|
444 |
|
445 |
When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
|
446 |
|
|
|
465 |
print(response.text)
|
466 |
```
|
467 |
|
468 |
+
#### Batch Prompts Inference
|
469 |
|
470 |
Conducting inference with batch prompts is quite straightforward; just place them within a list structure:
|
471 |
|
|
|
485 |
print(response)
|
486 |
```
|
487 |
|
488 |
+
#### Multi-turn Conversation
|
489 |
|
490 |
There are two ways to do the multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use above introduced method, the other is to use the `pipeline.chat` interface.
|
491 |
|
configuration_internvl_chat.py
CHANGED
@@ -39,11 +39,11 @@ class InternVLChatConfig(PretrainedConfig):
|
|
39 |
super().__init__(**kwargs)
|
40 |
|
41 |
if vision_config is None:
|
42 |
-
vision_config = {}
|
43 |
logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
|
44 |
|
45 |
if llm_config is None:
|
46 |
-
llm_config = {}
|
47 |
logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
|
48 |
|
49 |
self.vision_config = InternVisionConfig(**vision_config)
|
|
|
39 |
super().__init__(**kwargs)
|
40 |
|
41 |
if vision_config is None:
|
42 |
+
vision_config = {'architectures': ['InternVisionModel']}
|
43 |
logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
|
44 |
|
45 |
if llm_config is None:
|
46 |
+
llm_config = {'architectures': ['Phi3ForCausalLM']}
|
47 |
logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
|
48 |
|
49 |
self.vision_config = InternVisionConfig(**vision_config)
|
modeling_intern_vit.py
CHANGED
@@ -3,6 +3,7 @@
|
|
3 |
# Copyright (c) 2024 OpenGVLab
|
4 |
# Licensed under The MIT License [see LICENSE for details]
|
5 |
# --------------------------------------------------------
|
|
|
6 |
from typing import Optional, Tuple, Union
|
7 |
|
8 |
import torch
|
|
|
3 |
# Copyright (c) 2024 OpenGVLab
|
4 |
# Licensed under The MIT License [see LICENSE for details]
|
5 |
# --------------------------------------------------------
|
6 |
+
|
7 |
from typing import Optional, Tuple, Union
|
8 |
|
9 |
import torch
|