Merge branch 'main' of https://huggingface.co/spaces/SoybeanMilk/whisper-webui-translate
1. Thank you SoybeanMilk for assisting in the development and integration of the excellent madlad400 translation model.
2. Added a translationTorchDtypeFloat16 checkbox that controls whether non-quantized translation models are loaded with torch dtype float16.
   - transformers: torch_dtype=torch.float16
     Loads the float32 translation model in float16 when the system has GPU support (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
3. Added non-quantized ALMA and madlad400 options to the translation tab menu. They are displayed only in environments where GPU support is available.
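
As a rough illustration of the new option (a minimal sketch, not the app's actual code path; the model name below is just one of the non-quantized checkpoints added in this commit), loading a float32 translation model in float16 with transformers looks like this:

```python
import torch
import transformers

# Sketch of the behavior behind the new checkbox: on a GPU-capable system the
# float32 checkpoint is loaded as float16, roughly halving VRAM use. Quantized
# formats (CTranslate2, GPTQ, GGUF) are unaffected because their weights are
# already stored in their own quantized types.
use_float16 = torch.cuda.is_available()

model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    "jbochi/madlad400-3b-mt",  # example non-quantized model added in this commit
    torch_dtype=torch.float16 if use_float16 else "auto",
    low_cpu_mem_usage=True,
)
```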
- app.py +12 -9
- config.json5 +27 -0
- docs/options.md +5 -1
- docs/translateModel.md +47 -2
- src/config.py +2 -0
- src/translation/translationModel.py +52 -14
app.py
CHANGED

@@ -236,9 +236,10 @@ class WhisperTranscriber:
 madlad400ModelName: str = decodeOptions.pop("madlad400ModelName")
 madlad400LangName: str = decodeOptions.pop("madlad400LangName")

-translationBatchSize: int
-translationNoRepeatNgramSize: int
-translationNumBeams: int
+translationBatchSize: int = decodeOptions.pop("translationBatchSize")
+translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
+translationNumBeams: int = decodeOptions.pop("translationNumBeams")
+translationTorchDtypeFloat16: bool = decodeOptions.pop("translationTorchDtypeFloat16")

 sourceInput: str = decodeOptions.pop("sourceInput")
 urlData: str = decodeOptions.pop("urlData")

@@ -367,16 +368,16 @@ class WhisperTranscriber:
 selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
 translationLang = get_lang_from_m2m100_name(mt5LangName)
 elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
-selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-
+selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-7B-ct2:int8_float16/avan"
 selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
 translationLang = get_lang_from_m2m100_name(ALMALangName)
 elif translateInput == "madlad400" and madlad400LangName is not None and len(madlad400LangName) > 0:
-selectedModelName = madlad400ModelName if madlad400ModelName is not None and len(madlad400ModelName) > 0 else "madlad400-
+selectedModelName = madlad400ModelName if madlad400ModelName is not None and len(madlad400ModelName) > 0 else "madlad400-3b-mt-ct2-int8_float16/SoybeanMilk"
 selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
 translationLang = get_lang_from_m2m100_name(madlad400LangName)

 if translationLang is not None:
-translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
+translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16)

 progress(0, desc="init transcribe")
 # Result

@@ -936,8 +937,9 @@ def create_ui(app_config: ApplicationConfig):
 mt5_models = app_config.get_model_names("mt5")
 ALMA_models = app_config.get_model_names("ALMA")
 madlad400_models = app_config.get_model_names("madlad400")
-if not torch.cuda.is_available(): #
-ALMA_models = list(filter(lambda alma: "
+if not torch.cuda.is_available(): #Load only GGUF and CT2 translation models in pure CPU environments..
+ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
+madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))

 common_whisper_inputs = lambda : {
 gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),

@@ -967,7 +969,8 @@
 common_translation_inputs = lambda : {
 gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
 gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
-gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams")
+gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams"),
+gr.Checkbox(label="Translation - Torch Dtype float16", info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF)", value=app_config.translation_torch_dtype_float16, elem_id="translationTorchDtypeFloat16")
 }

 common_vad_inputs = lambda : {
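
The create_ui change above implements point 3 of the commit message: on CPU-only systems the model dropdowns are filtered down to quantized entries, so the newly added float32 checkpoints only appear when a GPU is present. A small self-contained sketch of that gating (the GGUF entry name is a placeholder; the other names come from this commit's config):

```python
import torch

# Sketch of the GPU gating added in create_ui (app.py). Only quantized GGUF / CTranslate2
# entries survive on CPU-only systems; the non-quantized float32 models stay GPU-only.
ALMA_models = [
    "ALMA-7B/haoranxu",                      # non-quantized, added in this commit
    "ALMA-7B-ct2:int8_float16/avan",         # CTranslate2 quantized
    "ALMA-13B-GGUF/illustrative",            # placeholder name for a GGUF entry
]
madlad400_models = [
    "madlad400-3b-mt/jbochi",                        # non-quantized, added in this commit
    "madlad400-3b-mt-ct2-int8_float16/SoybeanMilk",  # CTranslate2 quantized
]

if not torch.cuda.is_available():
    ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
    madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))
```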
config.json5
CHANGED

@@ -229,6 +229,16 @@
   "type": "huggingface",
   "tokenizer_url": "haoranxu/ALMA-13B"
 },
+{
+  "name": "ALMA-7B/haoranxu",
+  "url": "haoranxu/ALMA-7B",
+  "type": "huggingface"
+},
+{
+  "name": "ALMA-13B/haoranxu",
+  "url": "haoranxu/ALMA-13B",
+  "type": "huggingface"
+},
 ],
 "madlad400": [
 {

@@ -243,6 +253,21 @@
   "type": "huggingface",
   "tokenizer_url": "jbochi/madlad400-10b-mt"
 },
+{
+  "name": "madlad400-3b-mt/jbochi",
+  "url": "jbochi/madlad400-3b-mt",
+  "type": "huggingface",
+},
+{
+  "name": "madlad400-7b-mt-bt/jbochi",
+  "url": "jbochi/madlad400-7b-mt-bt",
+  "type": "huggingface",
+},
+{
+  "name": "madlad400-10b-mt/jbochi",
+  "url": "jbochi/madlad400-10b-mt",
+  "type": "huggingface",
+},
 ]
 },
 // Configuration options that will be used if they are not specified in the command line arguments.

@@ -373,4 +398,6 @@
 "translation_no_repeat_ngram_size": 3,
 // Translation - Beam size (1 for greedy search).
 "translation_num_beams": 2,
+// Translation - Torch Dtype float16, Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF).
+"translation_torch_dtype_float16": true,
 }
docs/options.md
CHANGED

@@ -200,4 +200,8 @@ Prevent repetitions of ngrams with this size (set 0 to disable).
 - transformers: num_beams
   Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
 - ctranslate2: beam_size
-  Beam size (1 for greedy search).
+  Beam size (1 for greedy search).
+
+## Translation - Torch Dtype float16
+- transformers: torch_dtype=torch.float16
+  Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models such as Ctranslate2, GPTQ, GGUF)
docs/translateModel.md
CHANGED

@@ -42,6 +42,7 @@ NLLB-200 is a multilingual translation model introduced by Meta AI in July 2022.
 | [facebook/nllb-200-distilled-1.3B](https://huggingface.co/facebook/nllb-200-distilled-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.9 GB |
 | [facebook/nllb-200-1.3B](https://huggingface.co/facebook/nllb-200-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.8 GB |
 | [facebook/nllb-200-3.3B](https://huggingface.co/facebook/nllb-200-3.3B) | 3.3B | 17.58 GB | float32 | ≈13.4 GB |
+| [facebook/nllb-moe-54b](https://huggingface.co/facebook/nllb-moe-54b) | 54B | 220.2 GB | float32 | N/A |

 ## NLLB-200-CTranslate2

@@ -78,8 +79,8 @@ The official support for ALMA currently includes 10 language directions: English

 | Name | Parameters | Size | type/quantize | Required VRAM |
 |------|------------|------|---------------|---------------|
-| [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | 7B | 26.95 GB | float32 |
-| [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | 13B | 52.07 GB | float32 |
+| [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | 7B | 26.95 GB | float32 | ≈13.2 GB (torch dtype in float16) |
+| [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | 13B | 52.07 GB | float32 | ≈25.4 GB (torch dtype in float16) |

 ## ALMA-GPTQ

@@ -111,6 +112,46 @@ GGUF is a file format for storing models for inference with GGML and executors b
 | [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
 | [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |

+## madlad400
+
+madlad400 is a multilingual machine translation model based on the T5 architecture introduced by Google DeepMind, Google Research in Sep 2023. It was trained on 250 billion tokens covering over 450 languages using publicly available data. The paper is titled "`MADLAD-400: A Multilingual And Document-Level Large Audited Dataset`" ([arXiv:2309.04662](https://arxiv.org/abs/2309.04662)).
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [jbochi/madlad400-3b-mt](https://huggingface.co/jbochi/madlad400-3b-mt) | 3B | 11.8 GB | float32 | ≈12 GB |
+| [jbochi/madlad400-7b-mt](https://huggingface.co/jbochi/madlad400-7b-mt) | 7.2B | 33.2 GB | float32 | ≈19.7 GB (torch dtype in float16) |
+| [jbochi/madlad400-7b-mt-bt](https://huggingface.co/jbochi/madlad400-7b-mt-bt) | 7.2B | 33.2 GB | float32 (finetuned on backtranslated data) | ≈19.7 GB (torch dtype in float16) |
+| [jbochi/madlad400-8b-lm](https://huggingface.co/jbochi/madlad400-8b-lm) | 8B | 34.52 GB | float32 | N/A |
+| [jbochi/madlad400-10b-mt](https://huggingface.co/jbochi/madlad400-10b-mt) | 10.7B | 42.86 GB | float32 | ≈24.3 GB (torch dtype in float16) |
+
+## madlad400-CTranslate2
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [SoybeanMilk/madlad400-3b-mt-ct2-int8_float16](https://huggingface.co/SoybeanMilk/madlad400-3b-mt-ct2-int8_float16) | 3B | 2.95 GB | int8_float16 | ≈2.7 GB |
+| [SoybeanMilk/madlad400-10b-mt-ct2-int8_float16](https://huggingface.co/SoybeanMilk/madlad400-10b-mt-ct2-int8_float16) | 10.7B | 10.7 GB | int8_float16 | ≈10 GB |
+
+## SeamlessM4T
+
+SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+
+It enables multiple tasks without relying on separate models:
+
+Speech-to-speech translation (S2ST)
+Speech-to-text translation (S2TT)
+Text-to-speech translation (T2ST)
+Text-to-text translation (T2TT)
+Automatic speech recognition (ASR)
+
+SeamlessM4T-v1 introduced by Seamless Communication team from Meta AI in Aug 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`" ([arXiv:2308.11596](https://arxiv.org/abs/2308.11596))
+SeamlessM4T-v2 introduced by Seamless Communication team from Meta AI in Dec 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`" ([arXiv:2312.05187](https://arxiv.org/abs/2312.05187))
+
+| Name | Parameters | Size | type/quantize | Required VRAM |
+|------|------------|------|---------------|---------------|
+| [facebook/hf-seamless-m4t-medium](https://huggingface.co/facebook/hf-seamless-m4t-medium) | 1.2B | 4.84 GB | float32 | N/A |
+| [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
+| [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | N/A |
+

 # Options

@@ -131,3 +172,7 @@ Prevent repetitions of ngrams with this size (set 0 to disable).
   Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
 - ctranslate2: beam_size
   Beam size (1 for greedy search).
+
+## Translation - Torch Dtype float16
+- transformers: torch_dtype=torch.float16
+  Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models such as Ctranslate2, GPTQ, GGUF)
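
The madlad400 models documented above select their output language with a `<2xx>` target-language prefix, which is what the `madlad400Prefix` added in `src/translation/translationModel.py` below builds. A rough standalone sketch of that calling convention (language code and input text are illustrative):

```python
import torch
import transformers

# Sketch of the madlad400 prompt convention used by translationModel.py: the target
# language is selected by prefixing the source text with "<2{lang_code}> ".
model_id = "jbochi/madlad400-3b-mt"
tokenizer = transformers.T5Tokenizer.from_pretrained(model_id, legacy=False)
model = transformers.T5ForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",
    low_cpu_mem_usage=True,
)
translator = transformers.pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# "<2ja>" asks for Japanese output; any supported target-language code works the same way.
print(translator("<2ja> How are you today?", max_length=256)[0]["generated_text"])
```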
src/config.py
CHANGED

@@ -82,6 +82,7 @@ class ApplicationConfig:
 translation_batch_size: int = 2,
 translation_no_repeat_ngram_size: int = 3,
 translation_num_beams: int = 2,
+translation_torch_dtype_float16: bool = True,
 # Whisper Segments Filter
 whisper_segments_filter: bool = False,
 whisper_segments_filters: List[str] = [],

@@ -150,6 +151,7 @@ class ApplicationConfig:
 self.translation_batch_size = translation_batch_size
 self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
 self.translation_num_beams = translation_num_beams
+self.translation_torch_dtype_float16 = translation_torch_dtype_float16
 # Whisper Segments Filter
 self.whisper_segments_filter = whisper_segments_filter
 self.whisper_segments_filters = whisper_segments_filters
src/translation/translationModel.py
CHANGED

@@ -21,11 +21,12 @@ class TranslationModel:
 batchSize: int = 2,
 noRepeatNgramSize: int = 3,
 numBeams: int = 2,
+torchDtypeFloat16: bool = True,
 downloadRoot: Optional[str] = None,
 localFilesOnly: bool = False,
 loadModel: bool = False,
 ):
-"""Initializes the M2M100 / Nllb-200 / mt5 model.
+"""Initializes the M2M100 / Nllb-200 / mt5 / ALMA / madlad400 translation model.

 Args:
 modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,

@@ -76,8 +77,10 @@
 device = "cuda" if "ct2" in self.modelPath else "cuda:0"
 else:
 device = "cpu"
+torchDtypeFloat16 = False

 self.device = device
+self.torchDtypeFloat16 = torchDtypeFloat16

 if loadModel:
 self.load_model()

@@ -85,8 +88,31 @@
 def load_model(self):
 """
 [from_pretrained]
-low_cpu_mem_usage(bool, optional)
-
+low_cpu_mem_usage(bool, optional):
+Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.
+
+torch_dtype (str or torch.dtype, optional):
+Override the default torch.dtype and load the model under a specific dtype. The different options are:
+1. torch.float16 or torch.bfloat16 or torch.float: load in a specified dtype, ignoring the model’s config.torch_dtype if one exists.
+If not specified the model will get loaded in torch.float (fp32).
+2. "auto" - A torch_dtype entry in the config.json file of the model will be attempted to be used.
+If this entry isn’t found then next check the dtype of the first weight in the checkpoint that’s of a floating point type and use that as dtype.
+This will load the model using the dtype it was saved in at the end of the training. It can’t be used as an indicator of how the model was trained.
+Since it could be trained in one of half precision dtypes, but saved in fp32.
+For some models the dtype they were trained in is unknown - you may try to check the model’s paper or reach out to the authors and
+ask them to add this information to the model’s card and to insert the torch_dtype entry in config.json on the hub.
+
+device_map (str or Dict[str, Union[int, str, torch.device]] or int or torch.device, optional):
+A map that specifies where each submodule should go. It doesn’t need to be refined to each parameter/buffer name,
+once a given module name is inside, every submodule of it will be sent to the same device.
+If we only pass the device (e.g., "cpu", "cuda:1", "mps", or a GPU ordinal rank like 1) on which the model will be allocated,
+the device map will map the entire model to this device. Passing device_map = 0 means put the whole model on GPU 0.
+To have Accelerate compute the most optimized device_map automatically, set device_map="auto". For more information about each option see designing a device map.
+
+load_in_8bit (bool, optional, defaults to False)
+If True, will convert the loaded model into mixed-8bit quantized model. To use this feature please install bitsandbytes (pip install -U bitsandbytes).
+load_in_4bit (bool, optional, defaults to False)
+If True, will convert the loaded model into 4bit precision quantized model. To use this feature install the latest version of bitsandbytes (pip install -U bitsandbytes).

 [transformers.AutoTokenizer.from_pretrained]
 use_fast (bool, optional, defaults to True):

@@ -166,7 +192,7 @@
 elif "mt5" in self.modelPath:
 self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
 self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
-self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True)
+self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
 self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
 elif "ALMA" in self.modelPath:
 self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "

@@ -185,18 +211,25 @@
 import ctransformers
 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
 if self.device == "cpu":
-self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file)
+self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, low_cpu_mem_usage=True)
 else:
-self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50)
-
+self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50, low_cpu_mem_usage=True)
+else:
+self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
+self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
+self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, device=self.device if "GPTQ" not in self.modelPath and "GGUF" not in self.modelPath else None, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
+elif "madlad400" in self.modelPath:
+self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
+self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False)
+self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
+self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
 else:
 self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
-self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
+self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
 if "m2m100" in self.modelPath:
 self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
 else: #NLLB
 self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
-
 except Exception as e:
 self.release_vram()
 raise e

@@ -223,13 +256,13 @@
 del self.transTokenizer
 if getattr(self, "transModel", None) is not None:
 del self.transModel
+import gc
+gc.collect()
 try:
 torch.cuda.empty_cache()
 except Exception as e:
 print(traceback.format_exc())
 print("\tcuda empty cache, error: " + str(e))
-import gc
-gc.collect()
 print("release vram end.")
 except Exception as e:
 print(traceback.format_exc())

@@ -294,6 +327,12 @@
 output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
 elif "GGUF" in self.modelPath:
 output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
+else:
+output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
+
+result = output[0]['generated_text']
+elif "madlad400" in self.modelPath:
+output = self.transTranslator(self.madlad400Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
 result = output[0]['generated_text']
 else: #M2M100 & NLLB
 output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)

@@ -356,9 +395,6 @@ def download_model(
 "pytorch_model.bin",
 "pytorch_model.bin.index.json",
 "pytorch_model-*.bin",
-"pytorch_model-00001-of-00003.bin",
-"pytorch_model-00002-of-00003.bin",
-"pytorch_model-00003-of-00003.bin",
 "sentencepiece.bpe.model",
 "tokenizer.json",
 "tokenizer_config.json",

@@ -368,6 +404,8 @@ def download_model(
 "spiece.model",
 "vocab.json", #m2m100
 "model.safetensors",
+"model-*.safetensors",
+"model.safetensors.index.json",
 "quantize_config.json",
 "tokenizer.model",
 "vocabulary.json"