File size: 8,702 Bytes
dc149ba 3ad5346 dc149ba 8c5885d 158e38e 8c5885d dc149ba e939d89 93baa69 dc149ba e939d89 b0ae3fd b2d474f 7239594 e939d89 93baa69 9e1c3ab e939d89 9e1c3ab dc149ba 8c5885d 46abd0a 8c5885d 46abd0a 8c5885d 5e8469e 8c5885d 98b9978 31097f0 3ad5346 5e8469e 98b9978 5e8469e 98b9978 8c5885d 98b9978 dc149ba e939d89 592978b dc149ba 2866119 dc149ba 2866119 dc149ba 2866119 dc149ba de3cada b52f918 8112e48 f010b24 31097f0 592978b 3ad5346 592978b 9215493 dc149ba cc4bac3 a71436a bccf7be 33e3967 5b11a3e bccf7be 33e3967 bccf7be 33e3967 bccf7be 33e3967 bccf7be a71436a 93baa69 d895aa7 93baa69 c603514 93baa69 c603514 93baa69 765a96d 93baa69 e3a8bf2 765a96d 9e1c3ab 765a96d d895aa7 9e1c3ab d895aa7 9e1c3ab f64ffeb 9c13f88 f64ffeb 9c13f88 765a96d 9c13f88 ea13813 765a96d ea13813 765a96d 9e1c3ab 765a96d d895aa7 9e1c3ab 93baa69 de3cada c603514 72801d9 93baa69 9e1c3ab 33e3967 de3cada 8c5885d 90e8b1d 76ef102 27f72d1 76ef102 90e8b1d 65fbb2d de3cada d895aa7 de3cada 8c5885d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 |
"""
Translation program for simple text.

1. Detect the language of the text with langid.
2. Translate the text into the target language chosen by the user.

Example adapted from
https://www.thepythoncode.com/article/machine-translation-using-huggingface-transformers-in-python

User input:
    string: text to be translated
    target_lang: language to translate the text into
Returns:
    string: translated text

Also worth trying: https://pypi.org/project/EasyNMT/
and: https://huggingface.co/IDEA-CCNL/Randeng-Deltalm-362M-En-Zh
"""
from __future__ import annotations
from typing import Iterable
import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
import argparse
import langid
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, logging
from easynmt import EasyNMT
# Initialize logging. `logging` here is the transformers logging module
# (imported above), not the standard-library one; verbosity is set to INFO
# before any model is loaded.
logging.set_verbosity_info()
logger = logging.get_logger("transformers")
# The NLLB-200 checkpoint is currently disabled: it is not loaded at import time.
# tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
# Initialize mbart50 models. Both are created once, at import time.
# NOTE(review): going by the EasyNMT model names, "m2en" is many-to-English
# and "en2m" is English-to-many -- confirm against the EasyNMT docs.
mbart_m2en_model = EasyNMT("mbart50_m2en")
mbart_en2m_model = EasyNMT("mbart50_en2m")
logger.info("mbart50 models initialized")
# Initialize the m2m_100 model (1.2B variant), also created at import time.
m2m_model = EasyNMT("m2m_100_1.2B")
logger.info("m2m_100 models initialized")
class myTheme(Base):
    """
    Custom Gradio theme for the translation demo.

    Every constructor argument is keyword-only and is forwarded unchanged to
    ``Base.__init__``. A fixed set of styling overrides (page background,
    primary button, slider and block styling) is then applied through ``set``.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.red,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.orange,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("handjet"),
            "cursive",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        # Palette, sizing and fonts are handled by the base theme.
        core_settings = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "spacing_size": spacing_size,
            "radius_size": radius_size,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**core_settings)

        # Fixed overrides layered on top of the base theme. The "*name"
        # strings are passed to ``set`` exactly as written.
        style_overrides = {
            "body_background_fill": "repeating-linear-gradient(135deg, *primary_800, *primary_800 10px, *primary_900 10px, *primary_900 20px)",
            "button_primary_background_fill": "linear-gradient(90deg, *primary_600, *secondary_800)",
            "button_primary_background_fill_hover": "linear-gradient(45deg, *primary_200, *secondary_300)",
            "button_primary_text_color": "white",
            "slider_color": "*secondary_300",
            "slider_color_dark": "*secondary_600",
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_shadow": "*shadow_drop_lg",
            "button_large_padding": "24px",
        }
        super().set(**style_overrides)
def detect_lang(article):
    """
    Detect the language of a piece of text with the langid library
    Args:
        article (string): text whose language should be identified
    Returns:
        string: first element of the langid classification result
        (the detected language short form)
    """
    classification = langid.classify(article)
    # Log the whole classification result, not just the language code.
    logger.info(f"language detected as {classification}")
    detected_code = classification[0]
    return detected_code
def opus_trans(article, target_language):
    """
    Translation by Helsinki-NLP model
    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into; "English" or "Chinese"
    Returns:
        string: translated piece of article based off target_language, or an
        "Error: ..." message when the target language is unsupported, equals
        the detected language, or the model could not be used
    """
    # Validate the selection first: previously an unsupported (or missing)
    # target language left target_lang unbound and raised UnboundLocalError.
    target_lang = {"English": "en", "Chinese": "zh"}.get(target_language)
    if target_lang is None:
        return "Error: Unsupported target language. Please select English or Chinese and try again."

    result_lang = detect_lang(article)
    if result_lang == target_lang:
        return "Error: You chose the same language as the article detected language. Please reselect language and try again."

    task_name = f"translation_{result_lang}_to_{target_lang}"
    model_name = f"Helsinki-NLP/opus-mt-{result_lang}-{target_lang}"
    try:
        translator = pipeline(task_name, model=model_name, tokenizer=model_name)
        return translator(article)[0]["translation_text"]
    except Exception:
        # Still best-effort, but no longer a bare except: the failure is
        # logged and KeyboardInterrupt/SystemExit are not swallowed.
        logger.exception(f"OPUS translation with {model_name} failed")
        return "Error: Model doesn't exist"
_NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"
# Maps the UI language name to (langid short code, NLLB language token).
_NLLB_LANGUAGES = {
    "English": ("en", "eng_Latn"),
    "Chinese": ("zh", "zho_Hans"),
}
_nllb_components = {}


def _load_nllb():
    """Return the cached NLLB (tokenizer, model) pair, loading it on first use."""
    if "model" not in _nllb_components:
        nllb_tokenizer = AutoTokenizer.from_pretrained(_NLLB_CHECKPOINT)
        nllb_model = AutoModelForSeq2SeqLM.from_pretrained(_NLLB_CHECKPOINT)
        # Store both only after both loaded, so a failed load is retried.
        _nllb_components.update(tokenizer=nllb_tokenizer, model=nllb_model)
    return _nllb_components["tokenizer"], _nllb_components["model"]


def nllb_trans(article, target_language, max_length=30):
    """
    Translation by the facebook/nllb-200-distilled-600M model
    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into; "English" or "Chinese"
        max_length (int): maximum generated length passed to
            ``model.generate``; the default of 30 matches the previous
            hard-coded value and will cut off long texts
    Returns:
        string: translated piece of article based off target_language, or an
        "Error: ..." message when the target language is unsupported or
        equals the detected language
    """
    # Validate the selection first: previously an unsupported (or missing)
    # target language left target_lang unbound and raised UnboundLocalError.
    language = _NLLB_LANGUAGES.get(target_language)
    if language is None:
        return "Error: Unsupported target language. Please select English or Chinese and try again."
    short_code, nllb_code = language

    result_lang = detect_lang(article)
    if result_lang == short_code:
        return "Error: You chose the same language as the article detected language. Please reselect language and try again."

    # The module-level tokenizer/model are not initialized in this file, so
    # the checkpoint is loaded lazily here, and only once a translation is
    # actually needed.
    tokenizer, model = _load_nllb()
    inputs = tokenizer(article, return_tensors="pt")
    translated_tokens = model.generate(
        **inputs,
        # Look the language token up in the vocabulary rather than through
        # the tokenizer's lang_code_to_id mapping.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(nllb_code),
        max_length=max_length,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
def mbart_trans(article, target_language):
    """
    Translation by the EasyNMT mbart50 models
    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into; "English" or "Chinese"
    Returns:
        string: translated piece of article based off target_language, or an
        "Error: ..." message when the target language is unsupported or
        equals the detected language
    """
    logger.info(f"Article to translate : {article}")
    logger.info("Chose which translation model: mbart model")
    logger.info(f"Language selected: {target_language}")

    # Validate the selection first: previously a missing selection (None) or
    # any other value left target_lang unbound and raised UnboundLocalError.
    target_lang = {"English": "en", "Chinese": "zh"}.get(target_language)
    if target_lang is None:
        message = "Error: Unsupported target language. Please select English or Chinese and try again."
        logger.warning(message)
        return message

    result_lang = detect_lang(article)
    if result_lang == target_lang:
        message = "Error: You chose the same language as the article detected language. Please reselect language and try again."
        logger.warning(message)
        return message

    # English output uses the m2en model; Chinese output uses the en2m model.
    # NOTE(review): en2m presumably expects English input, so a non-English
    # article translated to Chinese may need pivoting through English -- confirm.
    model = mbart_m2en_model if target_lang == "en" else mbart_en2m_model
    translated = model.translate(article, target_lang=target_lang)
    logger.info(f"Translated Result: {translated}")
    return translated
def m2m_trans(article, target_language):
    """
    Translation by the EasyNMT m2m_100 model
    Args:
        article (string): article that user wishes to translate
        target_language (string): language that user wishes to translate
            article into; "English" or "Chinese"
    Returns:
        string: translated piece of article based off target_language, or an
        "Error: ..." message when the target language is unsupported or
        equals the detected language
    """
    logger.info(f"Article to translate : {article}")
    logger.info("Chose which translation model: m2m model")
    logger.info(f"Language selected: {target_language}")

    # Validate the selection first: previously a missing selection (None) or
    # any other value left target_lang unbound and raised UnboundLocalError.
    target_lang = {"English": "en", "Chinese": "zh"}.get(target_language)
    if target_lang is None:
        message = "Error: Unsupported target language. Please select English or Chinese and try again."
        logger.warning(message)
        return message

    result_lang = detect_lang(article)
    if result_lang == target_lang:
        message = "Error: You chose the same language as the article detected language. Please reselect language and try again."
        logger.warning(message)
        return message

    # The same model serves both targets, so a single call replaces the
    # former identical English/Chinese branches.
    translated = m2m_model.translate(article, target_lang)
    logger.info(f"Translation Result: {translated}")
    return translated
def translate(article, toolkit, target_language):
    """
    Translate an article with the translation model selected by the user
    Args:
        article (string): article that user wishes to translate
        toolkit (string): translation model to use; "MBART" or "M2M"
        target_language (string): language that user wishes to translate
            article into
    Returns:
        string: result of the selected translation function, or an
        "Error: ..." message when the article is empty or the toolkit is
        not supported
    """
    # Nothing to translate: answer directly instead of calling a model.
    if not article or not article.strip():
        return "Error: Please enter an article to translate."

    # NOTE: opus_trans and nllb_trans are not wired in as toolkits here.
    if toolkit == "MBART":
        return mbart_trans(article, target_language)
    if toolkit == "M2M":
        return m2m_trans(article, target_language)

    # Previously any other toolkit value fell through to `return translated`
    # with the name unbound and raised UnboundLocalError.
    return f"Error: Unsupported translation model '{toolkit}'. Please select MBART or M2M."
# Bind the theme instance to its own name so the myTheme class is not
# overwritten by its instance.
app_theme = myTheme()

# Build the UI: article input, model and language selectors, result box and
# a button that runs translate() on the three inputs.
with gr.Blocks(theme=app_theme) as demo:
    article = gr.Textbox(label="Article")
    toolkit_select = gr.Radio(
        ["MBART", "M2M"], label="Select Translation Model", value="MBART"
    )
    # Preselect a language, like the model selector above, so translate()
    # never receives None as the target language.
    lang_select = gr.Radio(
        ["English", "Chinese"], label="Select Desired Language", value="English"
    )
    result = gr.Textbox(label="Translated Result")
    trans_btn = gr.Button("Translate")
    trans_btn.click(
        fn=translate, inputs=[article, toolkit_select, lang_select], outputs=result
    )

demo.launch()
|