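"""Gradio demo: Ukrainian text verbalization (numbers to words) with the
skypro1111/mbart-large-50-verbalization model, served via ONNX Runtime."""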
import time

import gradio as gr
import numpy as np
import onnxruntime
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

model_name = "skypro1111/mbart-large-50-verbalization"

# Example inputs shown in the Examples panel below the form
EXAMPLES = [
    ["мій телефон 0979456822"],
    ["квартира площею 11 тис кв м."],
    ["Пропонували хабар у 1 млрд грн."],
    ["1 2 3 4 5 6 7 8 9 10."],
    ["Крім того, парламентарій володіє шістьма ділянками землі (дві площею 25000 кв м, дві по 15000 кв м та дві по 10000 кв м) розташованими в Сосновій Балці Луганської області."],
    ["Підписуючи цей документ у 2003 році, голови Росії та України мали намір зміцнити співпрацю та сприяти розширенню двосторонніх відносин."],
    ["Очікується, що цей застосунок буде запущено 22.08.2025."],
    ["За інформацією від Державної служби з надзвичайних ситуацій станом на 7 ранку 15 липня."],
]

def download_model_from_hf(repo_id=model_name, model_dir="./"):
    """Download the ONNX model files from the Hugging Face Hub."""
    files = ["onnx/encoder_model.onnx", "onnx/decoder_model.onnx", "onnx/decoder_model.onnx_data"]
    for file in files:
        hf_hub_download(
            repo_id=repo_id,
            filename=file,
            local_dir=model_dir,
        )
    return files
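
# Note: hf_hub_download with local_dir preserves the repo-relative path, so
# with model_dir="./" the files land at ./onnx/encoder_model.onnx etc.,
# matching the paths passed to create_onnx_session below.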

def create_onnx_session(model_path, use_gpu=True):
    """Create an ONNX Runtime inference session."""
    session_options = onnxruntime.SessionOptions()
    session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    session_options.enable_mem_pattern = True
    session_options.enable_mem_reuse = True
    session_options.intra_op_num_threads = 8
    session_options.log_severity_level = 1

    cuda_provider_options = {
        'device_id': 0,
        'arena_extend_strategy': 'kSameAsRequested',
        'gpu_mem_limit': 0,  # 0 means no limit
        'cudnn_conv_algo_search': 'DEFAULT',
        'do_copy_in_default_stream': True,
    }

    if use_gpu and 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
        providers = [('CUDAExecutionProvider', cuda_provider_options)]
    else:
        providers = ['CPUExecutionProvider']

    session = onnxruntime.InferenceSession(
        model_path,
        providers=providers,
        sess_options=session_options
    )
    return session
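
# Note: if use_gpu is requested but the CUDAExecutionProvider is not available
# (e.g. on a CPU-only Space), the session above silently falls back to CPU.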

def generate_text(text, tokenizer, encoder_session, decoder_session, max_length=128):
    """Generate verbalized text for a single input (greedy decoding)."""
    # Prepare input
    inputs = tokenizer(text, return_tensors="np", padding=True, truncation=True, max_length=512)
    input_ids = inputs["input_ids"].astype(np.int64)
    attention_mask = inputs["attention_mask"].astype(np.int64)

    # Run encoder
    encoder_outputs = encoder_session.run(
        output_names=["last_hidden_state"],
        input_feed={
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
    )[0]

    # Initialize decoder input
    decoder_input_ids = np.array([[tokenizer.pad_token_id]], dtype=np.int64)

    # Generate sequence token by token
    for _ in range(max_length):
        # Run decoder over the full prefix generated so far
        decoder_outputs = decoder_session.run(
            output_names=["logits"],
            input_feed={
                "input_ids": decoder_input_ids,
                "encoder_hidden_states": encoder_outputs,
                "encoder_attention_mask": attention_mask,
            }
        )[0]

        # Append the most likely next token (argmax over the last position)
        next_token = decoder_outputs[:, -1:].argmax(axis=-1)
        decoder_input_ids = np.concatenate([decoder_input_ids, next_token], axis=-1)

        # Stop once the end-of-sequence token has been produced
        if tokenizer.eos_token_id in decoder_input_ids[0]:
            break

    # Decode sequence
    output_text = tokenizer.decode(decoder_input_ids[0], skip_special_tokens=True)
    return output_text
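
# Note: this is plain greedy decoding without a KV cache, so each step re-runs
# the decoder over the entire generated prefix. That is fine for short outputs
# like these examples, but the cost grows quadratically with output length.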

# Initialize models and tokenizer globally
print("Downloading models...")
download_model_from_hf()

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.src_lang = "uk_UA"
tokenizer.tgt_lang = "uk_UA"

print("Creating ONNX sessions...")
encoder_session = create_onnx_session("onnx/encoder_model.onnx")
decoder_session = create_onnx_session("onnx/decoder_model.onnx")

def inference(text):
    """Gradio inference function."""
    start_time = time.time()

    # Generate text
    output = generate_text(text, tokenizer, encoder_session, decoder_session)

    # Calculate inference time
    inference_time = time.time() - start_time
    return output, f"{inference_time:.2f} seconds"
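
# Quick smoke test (optional; uncomment to verify the pipeline end to end
# before launching the UI):
# print(inference("мій телефон 0979456822"))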

# Create Gradio interface
with gr.Blocks(title="Numbers to Words ONNX Inference") as demo:
    gr.Markdown("# Numbers to Words ONNX Inference")
    gr.Markdown("Convert numbers in Ukrainian text to words using an ONNX-optimized model")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter Ukrainian text with numbers...",
                lines=3
            )
            inference_button = gr.Button("Run Inference", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(
                label="Output Text",
                lines=3,
                interactive=False
            )
            inference_time = gr.Textbox(
                label="Inference Time",
                interactive=False
            )

    # Add examples
    gr.Examples(
        examples=EXAMPLES,
        inputs=input_text,
        label="Example Inputs"
    )

    # Set up inference button click event
    inference_button.click(
        fn=inference,
        inputs=input_text,
        outputs=[output_text, inference_time]
    )

if __name__ == "__main__":
    demo.launch(share=True)
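
# Note: share=True matters mainly for local runs; on Hugging Face Spaces the
# app is already publicly hosted, and Gradio ignores the flag with a warning.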