fix compatibility issue for transformers 4.46+

Files changed:
- README.md (+15 -8)
- configuration_internvl_chat.py (+2 -2)
- modeling_intern_vit.py (+1 -0)
README.md

@@ -5,6 +5,7 @@ library_name: transformers
 base_model:
 - OpenGVLab/InternViT-6B-448px-V1-5
 - internlm/internlm2-chat-20b
+new_version: OpenGVLab/InternVL2_5-26B
 base_model_relation: merge
 language:
 - multilingual
@@ -19,7 +20,7 @@ tags:
 
 # InternVL-Chat-V1-5
 
-[\[GitHub\]](https://github.com/OpenGVLab/InternVL) [\[Blog\]](https://internvl.github.io/blog/) [\[InternVL 1.0
+[\[GitHub\]](https://github.com/OpenGVLab/InternVL) [\[Blog\]](https://internvl.github.io/blog/) [\[InternVL 1.0\]](https://arxiv.org/abs/2312.14238) [\[InternVL 1.5\]](https://arxiv.org/abs/2404.16821) [\[Mini-InternVL\]](https://arxiv.org/abs/2410.16261)
 
 [\[Chat Demo\]](https://internvl.opengvlab.com/) [\[HF Demo\]](https://huggingface.co/spaces/OpenGVLab/InternVL) [\[Quick Start\]](#quick-start) [\[中文解读\]](https://zhuanlan.zhihu.com/p/706547971) [\[Documents\]](https://internvl.readthedocs.io/en/latest/)
 
@@ -86,7 +87,7 @@ We provide an example code to run InternVL-Chat-V1-5 using `transformers`.
 
 We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).
 
-> Please use transformers
+> Please use transformers>=4.37.2 to ensure the model works normally.
 
 ### Model Loading
 
@@ -386,7 +387,7 @@ response, history = model.chat(tokenizer, pixel_values, question, generation_con
 print(f'User: {question}\nAssistant: {response}')
 ```
 
-#### Streaming
+#### Streaming Output
 
 Besides this method, you can also use the following code to get streamed output.
 
@@ -426,12 +427,12 @@ Many repositories now support fine-tuning of the InternVL series models, includi
 LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by the MMRazor and MMDeploy teams.
 
 ```sh
-pip install lmdeploy
+pip install lmdeploy>=0.5.3
 ```
 
 LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.
 
-#### A 'Hello, world'
+#### A 'Hello, world' Example
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
@@ -446,7 +447,7 @@ print(response.text)
 
 If `ImportError` occurs while executing this case, please install the required dependency packages as prompted.
 
-#### Multi-images
+#### Multi-images Inference
 
 When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
 
@@ -471,7 +472,7 @@ response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe thes
 print(response.text)
 ```
 
-#### Batch
+#### Batch Prompts Inference
 
 Conducting inference with batch prompts is quite straightforward; just place them within a list structure:
 
@@ -491,7 +492,7 @@ response = pipe(prompts)
 print(response)
 ```
 
-#### Multi-turn
+#### Multi-turn Conversation
 
 There are two ways to do the multi-turn conversations with the pipeline. One is to construct messages according to the format of OpenAI and use above introduced method, the other is to use the `pipeline.chat` interface.
 
@@ -561,6 +562,12 @@ This project is released under the MIT license, while InternLM2 is licensed unde
 If you find this project useful in your research, please consider citing:
 
 ```BibTeX
+@article{gao2024mini,
+  title={Mini-internvl: A flexible-transfer pocket multimodal model with 5\% parameters and 90\% performance},
+  author={Gao, Zhangwei and Chen, Zhe and Cui, Erfei and Ren, Yiming and Wang, Weiyun and Zhu, Jinguo and Tian, Hao and Ye, Shenglong and He, Junjun and Zhu, Xizhou and others},
+  journal={arXiv preprint arXiv:2410.16261},
+  year={2024}
+}
 @article{chen2023internvl,
   title={InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks},
   author={Chen, Zhe and Wu, Jiannan and Wang, Wenhai and Su, Weijie and Chen, Guo and Xing, Sen and Zhong, Muyan and Zhang, Qinglong and Zhu, Xizhou and Lu, Lewei and Li, Bin and Luo, Ping and Lu, Tong and Qiao, Yu and Dai, Jifeng},
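For reference, the "Multi-turn Conversation" section renamed above builds on LMDeploy's `pipeline.chat` interface. Below is a minimal sketch of that usage pattern, assuming the LMDeploy VLM pipeline API (`pipeline`, `TurbomindEngineConfig`, `GenerationConfig`, `load_image`, and a `chat` call that returns a session object carrying the reply and history); the image URL and sampling parameters are illustrative, not taken from this diff.

```python
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from lmdeploy.vl import load_image

# Build a VLM pipeline for this repository's model (illustrative settings).
pipe = pipeline('OpenGVLab/InternVL-Chat-V1-5',
                backend_config=TurbomindEngineConfig(session_len=8192))
gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)

# Any reachable image works here; this URL is just an example.
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')

# First turn: pass (text, image); the returned session keeps the conversation state.
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)

# Follow-up turn: reuse the session so the model sees the previous exchange.
sess = pipe.chat('What is the animal doing?', session=sess, gen_config=gen_config)
print(sess.response.text)
```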
configuration_internvl_chat.py

@@ -39,11 +39,11 @@ class InternVLChatConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
         if vision_config is None:
-            vision_config = {}
+            vision_config = {'architectures': ['InternVisionModel']}
             logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
 
         if llm_config is None:
-            llm_config = {}
+            llm_config = {'architectures': ['InternLM2ForCausalLM']}
             logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
 
         self.vision_config = InternVisionConfig(**vision_config)
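The two changed defaults above are the substance of the transformers 4.46+ fix: the nested config dicts now declare their `architectures` up front, presumably so that code paths which rebuild the sub-configs from defaults (something newer transformers releases exercise more often) still know which model classes they describe. A hedged way to check the patched behaviour, assuming the repository loads with `trust_remote_code=True`; the printed values are expectations, not guaranteed output.

```python
from transformers import AutoConfig

# Load this repository's remote-code config.
config = AutoConfig.from_pretrained('OpenGVLab/InternVL-Chat-V1-5',
                                    trust_remote_code=True)
print(config.vision_config.architectures)  # expected: ['InternVisionModel']
print(config.llm_config.architectures)     # expected: ['InternLM2ForCausalLM']

# Rebuilding the config class with no explicit sub-configs exercises the
# patched branch: the defaults are no longer bare empty dicts.
fresh = type(config)()
print(fresh.llm_config.architectures)      # expected: ['InternLM2ForCausalLM']
```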
modeling_intern_vit.py

@@ -3,6 +3,7 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+
 from typing import Optional, Tuple, Union
 
 import torch